1 Data Understading & Preparation¶

In [ ]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb

from matplotlib.ticker import AutoMinorLocator
from matplotlib import gridspec

#scaling, normalization
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
#kmeans, dbscan, hierarchical (sklearn)
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
#evaluation
from sklearn.metrics import silhouette_score
from sklearn import metrics

#distance matrix (dbscan elbow, hierarchical)
from scipy.spatial.distance import pdist, squareform
# hierarchical (scipy)
from scipy.cluster.hierarchy import linkage, dendrogram
#seaborn #bokeh #altair
from sklearn.decomposition import PCA

from yellowbrick.cluster import KElbowVisualizer
from google.colab import files

!pip install pyfim
from fim import apriori
Collecting pyfim
  Downloading pyfim-6.28.tar.gz (357 kB)
     |████████████████████████████████| 357 kB 13.9 MB/s 
Building wheels for collected packages: pyfim
  Building wheel for pyfim (setup.py) ... done
  Created wheel for pyfim: filename=pyfim-6.28-cp37-cp37m-linux_x86_64.whl size=537786 sha256=7f0c078240f9aada9e12e1e542a02d2fa17a14ed48ebaebe13981e6baf197a47
  Stored in directory: /root/.cache/pip/wheels/08/9f/26/09cb4efd027e46f96e0a0f33d0a74be614d3caf89c1eeb75a8
Successfully built pyfim
Installing collected packages: pyfim
Successfully installed pyfim-6.28
In [ ]:
#caricamento del dataset
df = pd.read_csv('words_glasgow.csv')
#faccio una copia del dataset in caso di manipolazione dati
dfcopy= df.copy()
In [ ]:
#visualizzazione di alcune righe per avere un'idea dei dati
#le prime righe
df.head()
Out[ ]:
word length arousal valence dominance concreteness imageability familiarity aoa semsize gender polysemy web_corpus_freq
0 abattoir 8 4.200 2.864 4.333 5.455 4.391 2.382 6.760 4.652 5.391 0 160074.0
1 abbey 5 3.125 5.781 4.667 5.906 5.344 3.324 5.177 5.121 3.303 0 4224864.0
2 abbreviate 10 3.273 5.250 5.235 3.286 3.177 5.121 5.543 2.667 3.971 0 140105.0
3 abdicate 8 4.194 3.767 4.419 3.367 2.516 3.971 6.233 4.679 5.167 0 124123.0
4 abdication 10 3.846 3.880 4.800 3.292 2.571 3.097 6.407 5.083 4.571 0 128143.0
In [ ]:
#le ultime righe
df.tail()
Out[ ]:
word length arousal valence dominance concreteness imageability familiarity aoa semsize gender polysemy web_corpus_freq
4677 zeppelin 8 6.185 5.000 5.333 6.286 6.185 3.167 6.036 5.464 5.269 0 3192943.0
4678 zero 4 4.031 4.182 4.567 2.688 3.903 6.269 2.636 1.758 4.172 0 30735412.0
4679 zest 4 5.969 6.818 6.121 4.438 4.033 4.000 5.364 2.636 3.452 0 655010.0
4680 zoo 3 5.909 6.235 5.485 6.118 6.441 5.655 2.324 4.844 4.059 0 11589578.0
4681 zoology 7 4.571 5.765 5.114 4.429 3.914 4.382 5.824 4.571 3.657 0 1672374.0
In [ ]:
#visualizzazione della dimensione 
df.shape
Out[ ]:
(4682, 13)
In [ ]:
#prima ricognizione indicativa sui dati
df.describe()
Out[ ]:
length arousal valence dominance concreteness imageability familiarity aoa semsize gender polysemy web_corpus_freq
count 4682.000000 4682.000000 4682.000000 4682.000000 4682.000000 4682.000000 4682.000000 4682.000000 4682.000000 4682.000000 4682.000000 4.668000e+03
mean 6.348355 4.678129 5.086797 5.044939 4.566273 4.723018 5.271335 4.143427 4.136403 4.099933 0.080948 2.988976e+07
std 2.006230 1.097163 1.594344 0.930669 1.433689 1.363110 0.921218 1.252770 1.023293 0.912293 0.272785 8.490144e+07
min 2.000000 2.057000 1.030000 1.941000 1.636000 1.737000 1.647000 1.219000 1.375000 1.000000 0.000000 1.277000e+04
25% 5.000000 3.849000 4.115000 4.529000 3.242000 3.519250 4.706000 3.114000 3.438000 3.606000 0.000000 1.671100e+06
50% 6.000000 4.571000 5.290000 5.123000 4.471000 4.677000 5.438000 4.177000 4.186500 4.121000 0.000000 5.702982e+06
75% 8.000000 5.419000 6.088000 5.600000 5.971000 6.032000 5.969000 5.152000 4.882000 4.656000 0.000000 2.232705e+07
max 16.000000 8.177000 8.647000 8.371000 6.938000 6.941000 6.939000 6.971000 6.912000 6.971000 1.000000 2.022460e+09
In [ ]:
df.describe(include=['O'])
Out[ ]:
word
count 4682
unique 4682
top toward
freq 1
In [ ]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4682 entries, 0 to 4681
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   word             4682 non-null   object 
 1   length           4682 non-null   int64  
 2   arousal          4682 non-null   float64
 3   valence          4682 non-null   float64
 4   dominance        4682 non-null   float64
 5   concreteness     4682 non-null   float64
 6   imageability     4682 non-null   float64
 7   familiarity      4682 non-null   float64
 8   aoa              4682 non-null   float64
 9   semsize          4682 non-null   float64
 10  gender           4682 non-null   float64
 11  polysemy         4682 non-null   int64  
 12  web_corpus_freq  4668 non-null   float64
dtypes: float64(10), int64(2), object(1)
memory usage: 475.6+ KB

In this dataset there are 4682 unique words. Each word is examined in regards of different variables. Those variables are lenght, arousal, valence, dominance, concretness, imageability, familiarity, age of aquisition, semsize gender, polysemy and frequence in a google web corpus.`

testo in corsivo## 1.1 Data Semantics

In [ ]:
colors1 = ['#e5f5f9','#99d8c9','#2ca25f']
In [ ]:
first=df.loc[:,"familiarity":]
aoa_descr=first.describe()

print(aoa_descr.to_latex())
\begin{tabular}{lrrrrrr}
\toprule
{} &  familiarity &          aoa &      semsize &       gender &     polysemy &  web\_corpus\_freq \\
\midrule
count &  4682.000000 &  4682.000000 &  4682.000000 &  4682.000000 &  4682.000000 &     4.668000e+03 \\
mean  &     5.271335 &     4.143427 &     4.136403 &     4.099933 &     0.080948 &     2.988976e+07 \\
std   &     0.921218 &     1.252770 &     1.023293 &     0.912293 &     0.272785 &     8.490144e+07 \\
min   &     1.647000 &     1.219000 &     1.375000 &     1.000000 &     0.000000 &     1.277000e+04 \\
25\%   &     4.706000 &     3.114000 &     3.438000 &     3.606000 &     0.000000 &     1.671100e+06 \\
50\%   &     5.438000 &     4.177000 &     4.186500 &     4.121000 &     0.000000 &     5.702982e+06 \\
75\%   &     5.969000 &     5.152000 &     4.882000 &     4.656000 &     0.000000 &     2.232705e+07 \\
max   &     6.939000 &     6.971000 &     6.912000 &     6.971000 &     1.000000 &     2.022460e+09 \\
\bottomrule
\end{tabular}

testo in grassetto

Arousal¶

Brief description of the variables

In [ ]:
#NOME VARIABILE DI INTERESSE
refvar="arousal"



#calcolo frequenza assoluta per aoa
df[refvar].value_counts()

#BOXPLOT

plt.rcParams["figure.figsize"] = [16, 8]
sb.set_context("notebook", font_scale=1.5, rc={"font.size":14,"axes.titlesize": 16,"axes.labelsize": 12})

aoa_boxplot = sb.boxplot( x = refvar, data=df, color = colors1[1])

#bellurie
boxplot_str="boxplot"
str1="distribution of"+" "+refvar
str2=boxplot_str+" "+str1
plt.title(str2, size = 18)
aoa_boxplot.set_xlabel(refvar, rotation = "horizontal", size = 16)



#DISTRIBUTION
plt.rcParams["figure.figsize"] = [16, 12]
sb.set_context("notebook", font_scale=1.5, rc={"font.size":14,"axes.titlesize": 16,"axes.labelsize": 12})

aoa1= df[refvar].dropna(0, inplace = False)

data = aoa1
fig, ax = plt.subplots()
counts, bins, patches = ax.hist(data,color = colors1[1], bins=range(1,10),edgecolor='black')

# Set the ticks to be at the edges of the bins.
ax.set_xticks(bins)

# Label the raw counts and the percentages below the x-axis...
#bellurie
plt.title(str1, size = 18)
plt.xlabel(xlabel = refvar, fontsize = 20, labelpad=20)
plt.ylabel(ylabel = "Occurences", fontsize = 16)
sb.despine(right = True)
plt.show()

aoa_descr=df[refvar].describe()

print(aoa_descr.to_latex())
\begin{tabular}{lr}
\toprule
{} &      arousal \\
\midrule
count &  4682.000000 \\
mean  &     4.678129 \\
std   &     1.097163 \\
min   &     2.057000 \\
25\%   &     3.849000 \\
50\%   &     4.571000 \\
75\%   &     5.419000 \\
max   &     8.177000 \\
\bottomrule
\end{tabular}

Other comments

Valence¶

Brief description of the variables

In [ ]:
#NOME VARIABILE DI INTERESSE
refvar="valence"



#calcolo frequenza assoluta per aoa
df[refvar].value_counts()

#BOXPLOT

plt.rcParams["figure.figsize"] = [16, 8]
sb.set_context("notebook", font_scale=1.5, rc={"font.size":14,"axes.titlesize": 16,"axes.labelsize": 12})

aoa_boxplot = sb.boxplot( x = refvar, data=df, color = colors1[1])

#bellurie
boxplot_str="boxplot"
str1="distribution of"+" "+refvar
str2=boxplot_str+" "+str1
plt.title(str2, size = 18)
aoa_boxplot.set_xlabel(refvar, rotation = "horizontal", size = 16)



#DISTRIBUTION
plt.rcParams["figure.figsize"] = [16, 12]
sb.set_context("notebook", font_scale=1.5, rc={"font.size":14,"axes.titlesize": 16,"axes.labelsize": 12})

aoa1= df[refvar].dropna(0, inplace = False)

data = aoa1
fig, ax = plt.subplots()
counts, bins, patches = ax.hist(data,color = colors1[1], bins=range(1,10),edgecolor='black')

# Set the ticks to be at the edges of the bins.
ax.set_xticks(bins)

# Label the raw counts and the percentages below the x-axis...
#bellurie
plt.title(str1, size = 18)
plt.xlabel(xlabel = refvar, fontsize = 20, labelpad=20)
plt.ylabel(ylabel = "Occurences", fontsize = 16)
sb.despine(right = True)
plt.show()

aoa_descr=df[refvar].describe()

print(aoa_descr.to_latex())
\begin{tabular}{lr}
\toprule
{} &      valence \\
\midrule
count &  4682.000000 \\
mean  &     5.086797 \\
std   &     1.594344 \\
min   &     1.030000 \\
25\%   &     4.115000 \\
50\%   &     5.290000 \\
75\%   &     6.088000 \\
max   &     8.647000 \\
\bottomrule
\end{tabular}

In [ ]:

Other comments

In [ ]:

Dominance¶

Brief description of the variables

In [ ]:
refvar="dominance"
In [ ]:
#calcolo frequenza assoluta per aoa
df[refvar].value_counts()

#BOXPLOT

plt.rcParams["figure.figsize"] = [16, 8]
sb.set_context("notebook", font_scale=1.5, rc={"font.size":14,"axes.titlesize": 16,"axes.labelsize": 12})

aoa_boxplot = sb.boxplot( x = refvar, data=df, color = colors1[1])

#bellurie
boxplot_str="boxplot"
str1="distribution of"+" "+refvar
str2=boxplot_str+" "+str1
plt.title(str2, size = 18)
aoa_boxplot.set_xlabel(refvar, rotation = "horizontal", size = 16)



#DISTRIBUTION
plt.rcParams["figure.figsize"] = [16, 12]
sb.set_context("notebook", font_scale=1.5, rc={"font.size":14,"axes.titlesize": 16,"axes.labelsize": 12})

aoa1= df[refvar].dropna(0, inplace = False)

data = aoa1
fig, ax = plt.subplots()
counts, bins, patches = ax.hist(data,color = colors1[1], bins=range(1,10),edgecolor='black')

# Set the ticks to be at the edges of the bins.
ax.set_xticks(bins)

# Label the raw counts and the percentages below the x-axis...
#bellurie
plt.title(str1, size = 18)
plt.xlabel(xlabel = refvar, fontsize = 20, labelpad=20)
plt.ylabel(ylabel = "Occurences", fontsize = 16)
sb.despine(right = True)
plt.show()

aoa_descr=df[refvar].describe()

print(aoa_descr.to_latex())
\begin{tabular}{lr}
\toprule
{} &    dominance \\
\midrule
count &  4682.000000 \\
mean  &     5.044939 \\
std   &     0.930669 \\
min   &     1.941000 \\
25\%   &     4.529000 \\
50\%   &     5.123000 \\
75\%   &     5.600000 \\
max   &     8.371000 \\
\bottomrule
\end{tabular}

Other comments

Concreteness¶

(numerical): represents the degree to which something can be experienced by our senses. It ranges from 1, abstract words, to 7, concrete words.

In [ ]:
#NOME VARIABILE DI INTERESSE
refvar="concreteness"



#calcolo frequenza assoluta per aoa
df[refvar].value_counts()

#BOXPLOT

plt.rcParams["figure.figsize"] = [16, 8]
sb.set_context("notebook", font_scale=1.5, rc={"font.size":14,"axes.titlesize": 16,"axes.labelsize": 12})

aoa_boxplot = sb.boxplot( x = refvar, data=df, color = colors1[1])

#bellurie
boxplot_str="boxplot"
str1="distribution of"+" "+refvar
str2=boxplot_str+" "+str1
plt.title(str2, size = 18)
aoa_boxplot.set_xlabel(refvar, rotation = "horizontal", size = 16)



#DISTRIBUTION
plt.rcParams["figure.figsize"] = [16, 12]
sb.set_context("notebook", font_scale=1.5, rc={"font.size":14,"axes.titlesize": 16,"axes.labelsize": 12})

aoa1= df[refvar].dropna(0, inplace = False)

data = aoa1
fig, ax = plt.subplots()
counts, bins, patches = ax.hist(data,color = colors1[1], bins=range(1,8),edgecolor='black')

# Set the ticks to be at the edges of the bins.
ax.set_xticks(bins)

# Label the raw counts and the percentages below the x-axis...
#bellurie
plt.title(str1, size = 18)
plt.xlabel(xlabel = refvar, fontsize = 20, labelpad=20)
plt.ylabel(ylabel = "Occurences", fontsize = 16)
sb.despine(right = True)
plt.show()

aoa_descr=df[refvar].describe()

print(aoa_descr.to_latex())
\begin{tabular}{lr}
\toprule
{} &  concreteness \\
\midrule
count &   4682.000000 \\
mean  &      4.566273 \\
std   &      1.433689 \\
min   &      1.636000 \\
25\%   &      3.242000 \\
50\%   &      4.471000 \\
75\%   &      5.971000 \\
max   &      6.938000 \\
\bottomrule
\end{tabular}

Other comments

In [ ]:

Imageability¶

(numerical): measures how difficult is to generate a mental image of something. It ranges from 1, hard to imagine, to 7, easy to imagine.

In [ ]:
#NOME VARIABILE DI INTERESSE
refvar="imageability"



#calcolo frequenza assoluta per aoa
df[refvar].value_counts()

#BOXPLOT

plt.rcParams["figure.figsize"] = [16, 8]
sb.set_context("notebook", font_scale=1.5, rc={"font.size":14,"axes.titlesize": 16,"axes.labelsize": 12})

aoa_boxplot = sb.boxplot( x = refvar, data=df, color = colors1[1])

#bellurie
boxplot_str="boxplot"
str1="distribution of"+" "+refvar
str2=boxplot_str+" "+str1
plt.title(str2, size = 18)
aoa_boxplot.set_xlabel(refvar, rotation = "horizontal", size = 16)



#DISTRIBUTION
plt.rcParams["figure.figsize"] = [16, 12]
sb.set_context("notebook", font_scale=1.5, rc={"font.size":14,"axes.titlesize": 16,"axes.labelsize": 12})

aoa1= df[refvar].dropna(0, inplace = False)

data = aoa1
fig, ax = plt.subplots()
counts, bins, patches = ax.hist(data,color = colors1[1], bins=range(1,8),edgecolor='black')

# Set the ticks to be at the edges of the bins.
ax.set_xticks(bins)

# Label the raw counts and the percentages below the x-axis...
#bellurie
plt.title(str1, size = 18)
plt.xlabel(xlabel = refvar, fontsize = 20, labelpad=20)
plt.ylabel(ylabel = "Occurences", fontsize = 16)
sb.despine(right = True)
plt.show()

aoa_descr=df[refvar].describe()

print(aoa_descr.to_latex())
\begin{tabular}{lr}
\toprule
{} &  imageability \\
\midrule
count &   4682.000000 \\
mean  &      4.723018 \\
std   &      1.363110 \\
min   &      1.737000 \\
25\%   &      3.519250 \\
50\%   &      4.677000 \\
75\%   &      6.032000 \\
max   &      6.941000 \\
\bottomrule
\end{tabular}

Other comments

In [ ]:

Familiarity¶

(numerical): is a measure of a word’s subjective experience. The range is from 1 (unfamiliar) to 7 (familiar).

In [ ]:
#NOME VARIABILE DI INTERESSE
refvar="familiarity"



#calcolo frequenza assoluta per aoa
df[refvar].value_counts()

#BOXPLOT

plt.rcParams["figure.figsize"] = [16, 8]
sb.set_context("notebook", font_scale=1.5, rc={"font.size":14,"axes.titlesize": 16,"axes.labelsize": 12})

aoa_boxplot = sb.boxplot( x = refvar, data=df, color = colors1[1])

#bellurie
boxplot_str="boxplot"
str1="distribution of"+" "+refvar
str2=boxplot_str+" "+str1
plt.title(str2, size = 18)
aoa_boxplot.set_xlabel(refvar, rotation = "horizontal", size = 16)



#DISTRIBUTION
plt.rcParams["figure.figsize"] = [16, 12]
sb.set_context("notebook", font_scale=1.5, rc={"font.size":14,"axes.titlesize": 16,"axes.labelsize": 12})

aoa1= df[refvar].dropna(0, inplace = False)

data = aoa1
fig, ax = plt.subplots()
counts, bins, patches = ax.hist(data,color = colors1[1], bins=range(1,8),edgecolor='black')

# Set the ticks to be at the edges of the bins.
ax.set_xticks(bins)

# Label the raw counts and the percentages below the x-axis...
#bellurie
plt.title(str1, size = 18)
plt.xlabel(xlabel = refvar, fontsize = 20, labelpad=20)
plt.ylabel(ylabel = "Occurences", fontsize = 16)
sb.despine(right = True)
plt.show()

aoa_descr=df[refvar].describe()

print(aoa_descr.to_latex())
\begin{tabular}{lr}
\toprule
{} &  familiarity \\
\midrule
count &  4682.000000 \\
mean  &     5.271335 \\
std   &     0.921218 \\
min   &     1.647000 \\
25\%   &     4.706000 \\
50\%   &     5.438000 \\
75\%   &     5.969000 \\
max   &     6.939000 \\
\bottomrule
\end{tabular}

Other comments

In [ ]:

Age of Aquisition¶

The variable age of aquisition (aoa) indicates the supposed age in which that person first learned that specific word. Clearly it is not easy to remember at which age a word is learned, and for that reason in this dataset this variables refears to an estimation of the age of aquisition. The scale is defined as a series of consecutive 2-year periods from the ages of 2 and 12 years, and a final period referring to 13 years and older. This shows how we have 7 different ranges, 0-2, 2-4, 4-6, 6-8, 8-10-, 10-12 and 13+.

In [ ]:
#calcolo frequenza assoluta per aoa
df["aoa"].value_counts()
Out[ ]:
5.000    45
4.000    29
3.029    24
4.657    23
5.029    22
         ..
5.278     1
6.036     1
6.633     1
5.839     1
1.871     1
Name: aoa, Length: 953, dtype: int64
In [ ]:
#imposto lo spazio
colors1 = ['#e5f5f9','#99d8c9','#2ca25f']
plt.rcParams["figure.figsize"] = [16, 4]
sb.set_context("notebook", font_scale=1.5, rc={"font.size":14,"axes.titlesize": 16,"axes.labelsize": 12})
plt.subplot(1, 2, 1)
#distribuzione e tendenze centrali: boxplot
aoa_boxplot = sb.boxplot( x = "aoa", data=df, color = colors1[1])
plt.title("Boxplot distribution of age of aquisition", size = 18)
aoa_boxplot.set_xlabel("Age of aquisition", rotation = "horizontal", size = 16)
plt.subplot(1, 2, 2)

aoa1= df["aoa"].dropna(0, inplace = False)

#imposto la legenda (da rivedere)
#lables_aoa= [0-2, 2-4, 4-6, 6-8, 8-10, 10-12, 13]
#plt.legend(title="Age of aquisition ranges", title_fontsize = 14, bbox_to_anchor=(1, 1), labels=lables_aoa )

#distribuzione e tendenze centrali: istogramma
aoa_hist = plt.hist(aoa1,  color = colors1[1],edgecolor='black',bins=range(0,8))

plt.title("Distribution of aoa", size = 18)
plt.xlabel(xlabel = "Age of Aquisition", fontsize = 16)
plt.ylabel(ylabel = "Occurences", fontsize = 16)
sb.despine(right = True)

plt.show()
In [ ]:
colors1 = ['#e5f5f9','#99d8c9','#2ca25f']
plt.rcParams["figure.figsize"] = [16, 8]
sb.set_context("notebook", font_scale=1.5, rc={"font.size":14,"axes.titlesize": 16,"axes.labelsize": 12})

aoa_boxplot = sb.boxplot( x = "aoa", data=df, color = colors1[1])
plt.title("Boxplot distribution of age of aquisition", size = 18)
aoa_boxplot.set_xlabel("Age of aquisition", rotation = "horizontal", size = 16)

plt.show()
In [ ]:
colors1 = ['#e5f5f9','#99d8c9','#2ca25f']


#BOXPLOT

plt.rcParams["figure.figsize"] = [16, 8]
sb.set_context("notebook", font_scale=1.5, rc={"font.size":14,"axes.titlesize": 16,"axes.labelsize": 12})

aoa_boxplot = sb.boxplot( x = "aoa", data=df, color = colors1[1])
plt.title("Boxplot distribution of Age of Aquisition", size = 18)
bin_centers = 0.5 * np.diff(bins) + bins[:-1]
for i, x in zip(range(0,7), bin_centers):
    # Label the raw counts
    age_range=['0-2','2-4','4-6','6-8','8-10','10-12','13+']
    ax.annotate(age_range[i], xy=(x, 0), xycoords=('data', 'axes fraction'),
        xytext=(0, -22), textcoords='offset points',fontsize=16, va='top', ha='center')


aoa_boxplot.set_xlabel("Age of Aquisition", rotation = "horizontal", size = 16)



#DISTRIBUTION
plt.rcParams["figure.figsize"] = [16, 8]
sb.set_context("notebook", font_scale=1.5, rc={"font.size":14,"axes.titlesize": 16,"axes.labelsize": 12})

aoa1= df["aoa"].dropna(0, inplace = False)

data = aoa1
fig, ax = plt.subplots()
counts, bins, patches = ax.hist(data,color = colors1[1], bins=range(0,8),edgecolor='black')

# Set the ticks to be at the edges of the bins.
ax.set_xticks(bins)

# Label the raw counts and the percentages below the x-axis...
bin_centers = 0.5 * np.diff(bins) + bins[:-1]
for i, x in zip(range(0,7), bin_centers):
    # Label the raw counts
    age_range=['0-2','2-4','4-6','6-8','8-10','10-12','13+']
    ax.annotate(age_range[i], xy=(x, 0), xycoords=('data', 'axes fraction'),
        xytext=(0, -22), textcoords='offset points',fontsize=16, va='top', ha='center')

    # Label the percentages
    #percent = '%0.0f%%' % (100 * float(count) / counts.sum())
    #ax.annotate(percent, xy=(x, 0), xycoords=('data', 'axes fraction'),
        #xytext=(0, -32), textcoords='offset points', va='top', ha='center')


# Give ourselves some more room at the bottom of the plot
#plt.subplots_adjust(bottom=0.50)
plt.title("Distribution of aoa", size = 18)
plt.xlabel(xlabel = "Age of Aquisition", fontsize = 20, labelpad=20)
plt.ylabel(ylabel = "Occurences", fontsize = 16)
sb.despine(right = True)
plt.show()

aoa_descr=df["aoa"].describe()

print(aoa_descr.to_latex())
\begin{tabular}{lr}
\toprule
{} &          aoa \\
\midrule
count &  4682.000000 \\
mean  &     4.143427 \\
std   &     1.252770 \\
min   &     1.219000 \\
25\%   &     3.114000 \\
50\%   &     4.177000 \\
75\%   &     5.152000 \\
max   &     6.971000 \\
\bottomrule
\end{tabular}

In [ ]:
colors1 = ['#e5f5f9','#99d8c9','#2ca25f']

refvar="aoa"

#BOXPLOT

plt.rcParams["figure.figsize"] = [16, 8]
sb.set_context("notebook", font_scale=1.5, rc={"font.size":14,"axes.titlesize": 16,"axes.labelsize": 12})

aoa_boxplot = sb.boxplot( x = refvar, data=df, color = colors1[1])

#bellurie

plt.title("Boxplot distribution of aoa", size = 18)
aoa_boxplot.set_xlabel("Age of Aquisition", rotation = "horizontal", size = 16)



#DISTRIBUTION
plt.rcParams["figure.figsize"] = [16, 8]
sb.set_context("notebook", font_scale=1.5, rc={"font.size":14,"axes.titlesize": 16,"axes.labelsize": 12})

aoa1= df[refvar].dropna(0, inplace = False)

data = aoa1
fig, ax = plt.subplots()
counts, bins, patches = ax.hist(data,color = colors1[1], bins=range(1,8),edgecolor='black')

# Set the ticks to be at the edges of the bins.
ax.set_xticks(bins)

# Label the raw counts and the percentages below the x-axis...
bin_centers = 0.5 * np.diff(bins) + bins[:-1]
for i, x in zip(range(0,7), bin_centers):
    # Label the raw counts
    age_range=['0-2','2-4','4-6','6-8','8-10','10-12','13+']
    ax.annotate(age_range[i], xy=(x, 0), xycoords=('data', 'axes fraction'),
        xytext=(0, -22), textcoords='offset points',fontsize=16, va='top', ha='center')

    # Label the percentages
    #percent = '%0.0f%%' % (100 * float(count) / counts.sum())
    #ax.annotate(percent, xy=(x, 0), xycoords=('data', 'axes fraction'),
        #xytext=(0, -32), textcoords='offset points', va='top', ha='center')


# Give ourselves some more room at the bottom of the plot
plt.subplots_adjust(bottom=0.50)

# Label the raw counts and the percentages below the x-axis...
#bellurie
plt.title("Distribution of aoa", size = 18)
plt.xlabel(xlabel = "Age of Aquisition", fontsize = 20, labelpad=20)
plt.ylabel(ylabel = "Occurences", fontsize = 16)
sb.despine(right = True)
plt.show()

aoa_descr=df[refvar].describe()

print(aoa_descr.to_latex())
\begin{tabular}{lr}
\toprule
{} &          aoa \\
\midrule
count &  4682.000000 \\
mean  &     4.143427 \\
std   &     1.252770 \\
min   &     1.219000 \\
25\%   &     3.114000 \\
50\%   &     4.177000 \\
75\%   &     5.152000 \\
max   &     6.971000 \\
\bottomrule
\end{tabular}

In [ ]:

In [ ]:
aoa_descr=df["aoa"].describe()

print(aoa_descr.to_latex())
\begin{tabular}{lr}
\toprule
{} &          aoa \\
\midrule
count &  4682.000000 \\
mean  &     4.143427 \\
std   &     1.252770 \\
min   &     1.219000 \\
25\%   &     3.114000 \\
50\%   &     4.177000 \\
75\%   &     5.152000 \\
max   &     6.971000 \\
\bottomrule
\end{tabular}

There are 4682 occurrences for this variable. That means that there are no missing values, or NaN. The mean is 4.14, and the standard deviation is 1.25.

Size¶

In this dataset "size" is a measure of magnitude expressed in either concrete or abstarct terms (big, small). That is, if a word can be associated with adjectives like big or small (e.g. palace or mountain for concrete object, and knowledge or love for abstarct ideas).

In [ ]:
#calcolo frequenza assoluta per aoa
df["semsize"].value_counts()
Out[ ]:
4.000    52
5.000    42
3.000    35
4.971    27
4.677    26
         ..
5.241     1
5.891     1
4.160     1
6.281     1
5.867     1
Name: semsize, Length: 939, dtype: int64
In [ ]:
#imposto lo spazio
colors1 = ['#e5f5f9','#99d8c9','#2ca25f']
plt.rcParams["figure.figsize"] = [16, 4]
sb.set_context("notebook", font_scale=1.5, rc={"font.size":14,"axes.titlesize": 16,"axes.labelsize": 12})
plt.subplot(1, 2, 1)
#distribuzione e tendenze centrali: boxplot
aoa_boxplot = sb.boxplot( x = "semsize", data=df, color = colors1[1])
plt.title("Boxplot distribution of semantic size", size = 18)
aoa_boxplot.set_xlabel("semantic size", rotation = "horizontal", size = 16)
plt.subplot(1, 2, 2)

size1= df["semsize"].dropna(0, inplace = False)

#imposto la legenda (da rivedere)
#lables_size= [0-2, 2-4, 4-6, 6-8, 8-10, 10-12, 13]
#plt.legend(title="Age of aquisition ranges", title_fontsize = 14, bbox_to_anchor=(1, 1), labels=lables_size )
#distribuzione e tendenze centrali: istogramma
aoa_hist = plt.hist(size1,  color = colors1[1],edgecolor='black',bins=range(1,8))

#BELLURIE

plt.title("Distribution of semantic size", size = 18)
plt.xlabel(xlabel = "semantic size", fontsize = 16)
plt.ylabel(ylabel = "Degree", fontsize = 16)
sb.despine(right = True)

plt.show()
In [ ]:

In [ ]:

In [ ]:
colors1 = ['#e5f5f9','#99d8c9','#2ca25f']

refvar="semsize"

#BOXPLOT

plt.rcParams["figure.figsize"] = [16, 8]
sb.set_context("notebook", font_scale=1.5, rc={"font.size":14,"axes.titlesize": 16,"axes.labelsize": 12})

aoa_boxplot = sb.boxplot( x = refvar, data=df, color = colors1[1])

#bellurie

plt.title("Boxplot distribution of semantic size", size = 18)
aoa_boxplot.set_xlabel("Semantic Size", rotation = "horizontal", size = 16)



#DISTRIBUTION
plt.rcParams["figure.figsize"] = [16, 12]
sb.set_context("notebook", font_scale=1.5, rc={"font.size":14,"axes.titlesize": 16,"axes.labelsize": 12})

aoa1= df[refvar].dropna(0, inplace = False)

data = aoa1
fig, ax = plt.subplots()
counts, bins, patches = ax.hist(data,color = colors1[1], bins=range(1,8),edgecolor='black')

# Set the ticks to be at the edges of the bins.
ax.set_xticks(bins)

# Label the raw counts and the percentages below the x-axis...
#bellurie
plt.title("Distribution of semsize", size = 18)
plt.xlabel(xlabel = "Semantic Size", fontsize = 20, labelpad=20)
plt.ylabel(ylabel = "Occurences", fontsize = 16)
sb.despine(right = True)
plt.show()

aoa_descr=df[refvar].describe()

print(aoa_descr.to_latex())
\begin{tabular}{lr}
\toprule
{} &      semsize \\
\midrule
count &  4682.000000 \\
mean  &     4.136403 \\
std   &     1.023293 \\
min   &     1.375000 \\
25\%   &     3.438000 \\
50\%   &     4.186500 \\
75\%   &     4.882000 \\
max   &     6.912000 \\
\bottomrule
\end{tabular}

In [ ]:
df["semsize"].describe()
Out[ ]:
count    4682.000000
mean        4.136403
std         1.023293
min         1.375000
25%         3.438000
50%         4.186500
75%         4.882000
max         6.912000
Name: semsize, dtype: float64

The count of occurrences for this variable suggests that there are no missing values. The mean is 4.13 and the standard deviation is just above 1. This results suggests that the tendency is that is more common to think that a word has a bigger semantic size.

Gender¶

Gender in this dataset refers to how strongly its meaning is associated with male or female behaviour or idea. This variable could be very interesting in regards of the social bias that might, or might not, be present.

In [ ]:
#calcolo frequenza assoluta per aoa
df["gender"].value_counts()
Out[ ]:
4.000    86
5.000    39
3.971    32
4.500    29
4.029    28
         ..
3.129     1
4.861     1
1.743     1
4.080     1
6.588     1
Name: gender, Length: 1022, dtype: int64
In [ ]:
#imposto lo spazio
colors1 = ['#e5f5f9','#99d8c9','#2ca25f']
plt.rcParams["figure.figsize"] = [16, 4]
sb.set_context("notebook", font_scale=1.5, rc={"font.size":14,"axes.titlesize": 16,"axes.labelsize": 12})
plt.subplot(1, 2, 1)
#distribuzione e tendenze centrali: boxplot
gender_boxplot = sb.boxplot( x = "gender", data=df, color = colors1[1])
plt.title("Boxplot distribution of perceived gender", size = 18)
gender_boxplot.set_xlabel("gender", rotation = "horizontal", size = 16)
plt.subplot(1, 2, 2)

gender1= df["gender"].dropna(0, inplace = False)

#imposto la legenda
lables_size= [0-2, 2-4, 4-6, 6-8, 8-10, 10-12, 13]
plt.legend(title="Perceived gender", title_fontsize = 14, bbox_to_anchor=(1, 1), labels=lables_size )
#distribuzione e tendenze centrali: istogramma
aoa_hist = plt.hist(gender1,  color = colors1[1])
plt.title("Distribution of perceived gender", size = 18)
plt.xlabel(xlabel = "gender", fontsize = 16)
plt.ylabel(ylabel = "Degree", fontsize = 16)
sb.despine(right = True)

plt.show()
In [ ]:
df["gender"].describe()
Out[ ]:
count    4682.000000
mean        4.099933
std         0.912293
min         1.000000
25%         3.606000
50%         4.121000
75%         4.656000
max         6.971000
Name: gender, dtype: float64
In [ ]:
df["length"].describe()
Out[ ]:
count    4682.000000
mean        6.348355
std         2.006230
min         2.000000
25%         5.000000
50%         6.000000
75%         8.000000
max        16.000000
Name: length, dtype: float64
In [ ]:
import matplotlib.ticker as mticker
colors1 = ['#e5f5f9','#99d8c9','#2ca25f']

refvar="gender"

#BOXPLOT

plt.rcParams["figure.figsize"] = [16, 8]
sb.set_context("notebook", font_scale=1.5, rc={"font.size":14,"axes.titlesize": 16,"axes.labelsize": 12})

aoa_boxplot = sb.boxplot( x = refvar, data=df, color = colors1[1])

#bellurie

plt.title("Boxplot distribution of gender", size = 18)
aoa_boxplot.set_xlabel("Gender", rotation = "horizontal", size = 16)



#DISTRIBUTION
plt.rcParams["figure.figsize"] = [16, 12]
sb.set_context("notebook", font_scale=1.5, rc={"font.size":14,"axes.titlesize": 16,"axes.labelsize": 12})

aoa1= df[refvar].dropna(0, inplace = False)

data = aoa1
fig, ax = plt.subplots()
counts, bins, patches = ax.hist(data,color = colors1[1], bins=range(1,8),edgecolor='black')

# Set the ticks to be at the edges of the bins.
ax.set_xticks(bins)

# Label the raw counts and the percentages below the x-axis...
#bellurie
plt.title("Distribution of gender", size = 18)
plt.xlabel(xlabel = "Gender", fontsize = 20, labelpad=20)


plt.ylabel(ylabel = "Occurences", fontsize = 16)


ax.xaxis.set_minor_locator(mticker.FixedLocator((1, 7)))
ax.xaxis.set_minor_formatter(mticker.FixedFormatter(("Label A", "Label B")))
plt.setp(ax.yaxis.get_minorticklabels(), size=15, va="center")
ax.tick_params("x",which="minor",pad=25, left=False)

sb.despine(right = True)
plt.show()

aoa_descr=df[refvar].describe()

print(aoa_descr.to_latex())
\begin{tabular}{lr}
\toprule
{} &       gender \\
\midrule
count &  4682.000000 \\
mean  &     4.099933 \\
std   &     0.912293 \\
min   &     1.000000 \\
25\%   &     3.606000 \\
50\%   &     4.121000 \\
75\%   &     4.656000 \\
max   &     6.971000 \\
\bottomrule
\end{tabular}

This variable is not self intuitive. There is no visible correlation from the number and the perceived gender of that word. A supposition is that the higher the value, the more "masculine" the word is perceived. For example the word "actor" has a perceived gender value of 5.588, where the max value is 6.971.

Also it seems that the distribution is heavly centered, leaving a significant amount of outliers, as seen in the figure above.

In [ ]:
df[["word","gender"]].loc[[51, 52]]
Out[ ]:
word gender
51 actor 5.588
52 actress 1.303

1.2 Distribution of variables and statistics¶

Codice per le distribuzioni delle variabili 'arousal', 'valence', 'dominance', 'concreteness','imageability', 'familiarity'

In [ ]:
df.corr() #matrice di correlazione

#distribuzione delle variabili
scaler = MinMaxScaler()

var = ['arousal', 'valence', 'dominance', 'concreteness',
       'imageability', 'familiarity']

for e in var:
    print(e)
    Y = df[[e]].values
    Y_minmax = scaler.fit_transform(Y)
    plt.hist(Y_minmax, edgecolor='white')
    plt.show()

#piechart polisemia

#percentuale delle polisemiche 
m = 0
p = 0

for e in df['polysemy'].values:
    if e == 1:
        p += 1
    if e == 0:
        m += 1

p1 = p/(p+m)*100
m1 = m/(p+m)*100
print(p1, m1)

#grafico
labels = 'Polisemic', 'Monosemic'
sizes = [p1, m1]
explode = (0.1, 0)  # only "explode" the 2nd slice (i.e. 'Hogs')

fig1, ax1 = plt.subplots()
ax1.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%',
        shadow=True, startangle=90)
ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.

plt.show()

#matrice di correlazione
corr=df.corr()
plt.figure(figsize=(16, 6))
heatmap = sb.heatmap(corr, vmin=-1, vmax=1, annot=True, cmap='BrBG')
heatmap.set_title('Correlation Heatmap', fontdict={'fontsize':18}, pad=12);
#sb.heatmap(corr, cmap="Blues", annot=True)
arousal
valence
dominance
concreteness
imageability
familiarity
8.094831268688594 91.9051687313114

1.2.1 Variable Normalization¶

1.3 Assessing Data Quality¶

1.3.1 Missing Values¶

In [ ]:
df.isnull().sum()
Out[ ]:
word                0
length              0
arousal             0
valence             0
dominance           0
concreteness        0
imageability        0
familiarity         0
aoa                 0
semsize             0
gender              0
polysemy            0
web_corpus_freq    14
dtype: int64

The dataset seems to be almost without null values. In fact, there are only 14 NaN, and all are concentrated in the "web_corpus_freq" variable.

In [ ]:
df[df['web_corpus_freq'].isnull()]
Out[ ]:
word length arousal valence dominance concreteness imageability familiarity aoa semsize gender polysemy web_corpus_freq
585 burgle 6 5.118 2.303 3.656 4.970 5.424 5.200 3.735 4.697 5.333 0 NaN
753 Christmas 9 7.516 7.914 5.600 5.086 6.571 6.710 1.600 6.394 3.771 0 NaN
1070 Dad 3 4.912 6.849 4.618 6.257 6.400 6.853 1.265 5.147 6.706 0 NaN
1076 Dame 4 4.194 5.594 5.469 5.125 4.969 3.697 4.969 4.548 1.242 0 NaN
1540 Facebook 8 4.971 4.857 4.486 5.943 6.229 6.829 6.314 5.114 4.171 0 NaN
1559 FALSE 5 4.636 2.941 4.206 3.455 2.765 5.700 3.086 4.500 4.353 0 NaN
2673 Mom 3 5.667 7.936 4.813 6.424 6.250 6.594 1.333 5.094 1.097 0 NaN
2724 Mum 3 4.594 7.938 4.219 6.091 6.625 6.906 1.219 5.061 1.212 0 NaN
2726 Mummy 5 5.364 7.471 4.879 5.794 6.515 6.182 1.771 4.677 1.455 0 NaN
3773 skijump 7 5.914 5.771 5.486 6.200 6.529 4.758 5.028 5.389 5.000 0 NaN
4347 TRUE 4 5.743 7.914 6.219 2.529 2.719 6.156 2.400 5.424 3.182 0 NaN
4365 TV 2 4.824 5.706 4.559 6.677 6.857 6.706 2.206 3.333 4.629 0 NaN
4373 Twitter 7 4.235 4.943 4.824 4.886 5.600 6.273 6.971 4.771 3.829 0 NaN
4668 yo-yo 5 5.059 5.800 5.636 6.455 6.424 4.484 2.800 1.875 4.206 0 NaN

Looking at the values that are present in the dataset but are not present in the web corpus, we can spot some similarities. There are 3 recurrences of the same word but slightly different from one another: "Mom", "Mum" and "Mommy". Also the word "Dad" is not present in the corpus.

In addition, there are 3 words written with full capital letters: "FALSE", "TRUE", and "TV". If the corpus is case sensitive, that could be an explaination of why those words are not present.

Finally, there are 2 words of social media that are absent in the corpus: "Facebook" and "Twitter". This is a bit strange since those 2 words are really known.

1.3.2 Outliers¶

In [ ]:
#creating a copy of the df
df2 = df.copy()
#dropping some variables for boxplot visualzation
df3 = df2.drop(["word","web_corpus_freq", "polysemy","length"], axis=1)
df3.head()
Out[ ]:
arousal valence dominance concreteness imageability familiarity aoa semsize gender
0 4.200 2.864 4.333 5.455 4.391 2.382 6.760 4.652 5.391
1 3.125 5.781 4.667 5.906 5.344 3.324 5.177 5.121 3.303
2 3.273 5.250 5.235 3.286 3.177 5.121 5.543 2.667 3.971
3 4.194 3.767 4.419 3.367 2.516 3.971 6.233 4.679 5.167
4 3.846 3.880 4.800 3.292 2.571 3.097 6.407 5.083 4.571
In [ ]:
#displaying all boxplot in one plot

#defining outliers as a blue underscore circle
blue_circle = dict(markerfacecolor='blue', marker='o', markeredgecolor='white')

#defining fig, axs as plt.subplots. In the arguments we pass in that we want 1 row, and then the number of columns. 
#The number of columns will be equal to the length of df.columns. 
#Finally, we will set the figsize to 20 by 10.
fig, axs = plt.subplots(1, len(df3.columns), figsize=(20,10))

#defining the shape and color of the mean
mean_shape = dict(markerfacecolor='green', marker='D', markeredgecolor='green')

#creating a small for loop which will loop over each of our plot axes. 
#Using the enumerate function here to keep track of i, our index value
for i, ax in enumerate(axs.flat):
    #addong in our boxplot by calling upon ax.boxplot() and passing in df.iloc, 
    #which allows us to get columns by the index variable, i. 
    #We do this by creating square brackets, with a colon followed by a comma. 
    #This tells the iloc function to get all rows, and then we pass in the column index. 
    #Next we specify our outlier properties and pass in red_circle.
    ax.boxplot(df3.iloc[:,i], flierprops=blue_circle, showmeans = True, meanprops =mean_shape, notch=True)
    ax.set_title(df3.columns[i], fontsize=20, fontweight='bold')
    ax.tick_params(axis='y', labelsize=14)
    
plt.tight_layout()

First, some variables had to be dropped to perform this analysys with boxplots. "word" is not a numerical variable, and for that reason is not used in this plot. "web_corpus-freq" and "polysemy" can be analised on their own.

The plot shows that some variables are well distributed, like "concretness" and "imageability", while others are not, like "length", "arousal" and "dominance". Later this issue will be addressed with more depth.

In [ ]:
df["polysemy"].plot(kind="box", vert = False)
Out[ ]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f8740c62590>

"polysemy" has only 2 possible values, 0 and 1, that are false and true. For that reason it doesn't have outliers.

In [ ]:
df["polysemy"].value_counts().plot(kind="pie")
Out[ ]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f87417ba110>

"web_corpus_freq" instead seems to have a lot of outliers, as shown below.

In [ ]:
df["web_corpus_freq"].dropna(0, inplace = False).plot(kind="box", vert = False, logx=True)
Out[ ]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f8736941e90>

1.3.3 Errors¶

In [ ]:

In [ ]:
var = ['arousal', 'valence', 'dominance', 'concreteness',
       'imageability', 'familiarity','semsize','gender','aoa']

for element in var:
  for e in df[element].values:
    if(type(e)!=np.float64):
       print(type(e))

  print("tutto ok fra")
tutto ok fra
tutto ok fra
tutto ok fra
tutto ok fra
tutto ok fra
tutto ok fra
tutto ok fra
tutto ok fra
tutto ok fra
In [ ]:
#creating a copy of the df
df_errors = df.copy()
In [ ]:
df_errors[["word", "length"]]
Out[ ]:
word length
0 abattoir 8
1 abbey 5
2 abbreviate 10
3 abdicate 8
4 abdication 10
... ... ...
4677 zeppelin 8
4678 zero 4
4679 zest 4
4680 zoo 3
4681 zoology 7

4682 rows × 2 columns

In [ ]:
df_errors["word"] = df_errors.word.str.len()
In [ ]:
df_errors[["word", "length"]]
Out[ ]:
word length
0 8 8
1 5 5
2 10 10
3 8 8
4 10 10
... ... ...
4677 8 8
4678 4 4
4679 4 4
4680 3 3
4681 7 7

4682 rows × 2 columns

In [ ]:
df_errors["word"].equals(df_errors["length"])
Out[ ]:
True

All values of the variable 'word' are correct in regards of their length

1.3.4 Semantic Inconsistencies¶

In [ ]:
df.loc[df['polysemy']==1]
Out[ ]:
word length arousal valence dominance concreteness imageability familiarity aoa semsize gender polysemy web_corpus_freq
62 address 7 3.206 5.382 5.364 5.059 4.343 5.839 3.371 3.912 4.000 1 261872866.0
107 aim 3 4.794 6.606 6.829 2.677 2.941 5.909 3.857 4.206 4.265 1 28951240.0
188 apple 5 4.677 6.147 5.485 6.824 6.909 6.719 1.529 2.441 3.424 1 50551171.0
211 arm 3 3.735 5.471 5.774 6.727 6.571 6.546 1.457 3.171 4.265 1 23724057.0
216 arms 4 4.364 5.546 5.636 6.647 6.485 6.469 2.206 3.758 4.912 1 27432921.0
... ... ... ... ... ... ... ... ... ... ... ... ... ...
4630 wood 4 3.000 5.206 5.265 6.727 6.606 6.303 2.629 3.727 5.235 1 51130555.0
4647 wound 5 5.156 2.706 3.667 5.758 5.853 5.500 4.118 3.706 5.118 1 5820050.0
4661 yak 3 3.552 4.807 4.700 5.219 4.742 2.829 4.897 3.833 4.710 1 1056994.0
4662 yard 4 2.788 5.171 4.771 5.829 5.343 4.177 3.771 4.286 4.457 1 15075593.0
4663 yarn 4 2.750 5.273 5.219 5.818 5.333 3.813 4.303 2.394 2.813 1 5113505.0

379 rows × 13 columns

1.4 Eventual other variable trasformations¶

1.5 Pairwise correlations and eventual elimination of variables¶

An overview of the relation between the nine variables is provided in Fig. 6. Where a correlation greater than |0.6| is found, we plotted the values of the two variables for a better visualization (fig. 7). There is a strong correlation (0.91) between concreteness and imageability: it is difficult to imagine an abstract word and easier to imagine a concrete one. Moreover, concreteness and imageability relate to the other variables similarly, with a margin of +-0.14. Therefore we merged them into a new variable, perceivability. The values of perceivabilty are the mean of concreteness and imageability values. Other positively correlated variables are valence and dominance, with 0.72: the more valuable an item is perceived, the higher the degree of control over the object. Familiary and age of acquisition are instead negatively related: from the pairplot (Fig \ref{fig:pairplot}) is apparent that every word acquired in early age is highly familiar.

In [ ]:
#matrice di correlazione
corr=df.corr()
plt.figure(figsize=(16, 6))
heatmap = sb.heatmap(corr, vmin=-1, vmax=1, annot=True, cmap='BrBG')
heatmap.set_title('Correlation Heatmap', fontdict={'fontsize':18}, pad=12);
#sb.heatmap(corr, cmap="Blues", annot=True)
In [ ]:
#creating a copy of the df
dfpp = df.copy()
#dropping some variables for boxplot visualzation
dfpp1 = dfpp.drop(["word", "polysemy","length", 'gender', 'web_corpus_freq', 'semsize', 'arousal'], axis=1)
In [ ]:
sb.pairplot(dfpp1, 
            plot_kws=dict(marker=".", linewidth=1),
            diag_kws=dict(fill=False),
            corner=False
           )
Out[ ]:
<seaborn.axisgrid.PairGrid at 0x7f8736531950>
In [ ]:
sb.pairplot(dfpp1, 
            x_vars=['concreteness'],
            y_vars=['imageability'],
            plot_kws=dict(marker=".", linewidth=1),
            diag_kws=dict(fill=False),
            corner=False
           )

#plt.savefig('pp_imageability_concreteness.png', dpi=300)
#files.download('pp_imageability_concreteness.png')
Out[ ]:
<seaborn.axisgrid.PairGrid at 0x7f87358cfe50>
In [ ]:
pp_dv = sb.pairplot(dfpp1, 
            x_vars=['valence'],
            y_vars=['dominance'],
            plot_kws=dict(marker=".", linewidth=1),
            diag_kws=dict(fill=False),
            corner=False
           )

#plt.savefig('pp_dominance_valence.png', dpi=300)
#files.download('pp_dominance_valence.png')
In [ ]:
sb.pairplot(dfpp1, 
            x_vars=['familiarity'],
            y_vars=['aoa'],
            plot_kws=dict(marker=".", linewidth=1),
            diag_kws=dict(fill=False),
            corner=False
           )

#plt.savefig('pp_aoa_familiarity.png', dpi=300)
#files.download('pp_aoa_familiarity.png')
Out[ ]:
<seaborn.axisgrid.PairGrid at 0x7f87358ff910>
In [ ]:
df2["perceivability"] = df2[["imageability", "concreteness"]].mean(axis=1)
df2.head()
Out[ ]:
word length arousal valence dominance concreteness imageability familiarity aoa semsize gender polysemy web_corpus_freq perceivability
0 abattoir 8 4.200 2.864 4.333 5.455 4.391 2.382 6.760 4.652 5.391 0 160074.0 4.9230
1 abbey 5 3.125 5.781 4.667 5.906 5.344 3.324 5.177 5.121 3.303 0 4224864.0 5.6250
2 abbreviate 10 3.273 5.250 5.235 3.286 3.177 5.121 5.543 2.667 3.971 0 140105.0 3.2315
3 abdicate 8 4.194 3.767 4.419 3.367 2.516 3.971 6.233 4.679 5.167 0 124123.0 2.9415
4 abdication 10 3.846 3.880 4.800 3.292 2.571 3.097 6.407 5.083 4.571 0 128143.0 2.9315
In [ ]:
df_perc=df2.drop(["concreteness","imageability"], axis=1)
df_perc.head()
Out[ ]:
word length arousal valence dominance familiarity aoa semsize gender polysemy web_corpus_freq perceivability
0 abattoir 8 4.200 2.864 4.333 2.382 6.760 4.652 5.391 0 160074.0 4.9230
1 abbey 5 3.125 5.781 4.667 3.324 5.177 5.121 3.303 0 4224864.0 5.6250
2 abbreviate 10 3.273 5.250 5.235 5.121 5.543 2.667 3.971 0 140105.0 3.2315
3 abdicate 8 4.194 3.767 4.419 3.971 6.233 4.679 5.167 0 124123.0 2.9415
4 abdication 10 3.846 3.880 4.800 3.097 6.407 5.083 4.571 0 128143.0 2.9315

2 Clustering¶

2.1 Preprocessing¶

In [ ]:
dfprepro= df_perc.copy()
dfprepro=dfprepro.rename(columns={"gender": "masculinity"})
In [ ]:
dfprepro.loc[(dfprepro['web_corpus_freq'].isnull() == True), 'web_corpus_freq'] = dfprepro['web_corpus_freq'].mean()
In [ ]:
dfprepro[dfprepro['web_corpus_freq'].isnull()]
Out[ ]:
word length arousal valence dominance familiarity aoa semsize masculinity polysemy web_corpus_freq perceivability
In [ ]:
dfprepro["web_corpus_log"] = pd.qcut(dfprepro["web_corpus_freq"], 10) #taglio la variabile web_corpus_freq in 10 gruppi
print(dfprepro[["web_corpus_log", "web_corpus_freq"]].groupby(["web_corpus_log"], as_index=False).mean())
               web_corpus_log  web_corpus_freq
0       (12769.999, 575889.4]     3.097333e+05
1       (575889.4, 1242854.6]     9.041106e+05
2      (1242854.6, 2214291.0]     1.686212e+06
3      (2214291.0, 3597973.4]     2.849525e+06
4      (3597973.4, 5702981.5]     4.637392e+06
5      (5702981.5, 9168887.0]     7.342067e+06
6     (9168887.0, 16042883.5]     1.222427e+07
7    (16042883.5, 31729949.6]     2.273650e+07
8    (31729949.6, 69227170.1]     4.752152e+07
9  (69227170.1, 2022459848.0]     1.985905e+08
In [ ]:
dataframe = [dfprepro]
In [ ]:
for dataset in dataframe:
    dataset.loc[(dataset["web_corpus_freq"] > 10000) & (dataset["web_corpus_freq"] <= 100000), "web_corpus_freq"] = 4
    dataset.loc[(dataset["web_corpus_freq"] > 100000) & (dataset["web_corpus_freq"] <= 1000000), "web_corpus_freq"] = 5
    dataset.loc[(dataset["web_corpus_freq"] > 1000000) & (dataset["web_corpus_freq"] <= 10000000), "web_corpus_freq"] = 6
    dataset.loc[(dataset["web_corpus_freq"] > 10000000) & (dataset["web_corpus_freq"] <= 100000000), "web_corpus_freq"] = 7
    dataset.loc[(dataset["web_corpus_freq"] > 100000000) & (dataset["web_corpus_freq"] <= 1000000000), "web_corpus_freq"] = 8
    dataset.loc[dataset["web_corpus_freq"] > 1000000000, "web_corpus_freq"] = 9
    
dfprepro.head()
Out[ ]:
word length arousal valence dominance familiarity aoa semsize masculinity polysemy web_corpus_freq perceivability web_corpus_log
0 abattoir 8 4.200 2.864 4.333 2.382 6.760 4.652 5.391 0 5.0 4.9230 (12769.999, 575889.4]
1 abbey 5 3.125 5.781 4.667 3.324 5.177 5.121 3.303 0 6.0 5.6250 (3597973.4, 5702981.5]
2 abbreviate 10 3.273 5.250 5.235 5.121 5.543 2.667 3.971 0 5.0 3.2315 (12769.999, 575889.4]
3 abdicate 8 4.194 3.767 4.419 3.971 6.233 4.679 5.167 0 5.0 2.9415 (12769.999, 575889.4]
4 abdication 10 3.846 3.880 4.800 3.097 6.407 5.083 4.571 0 5.0 2.9315 (12769.999, 575889.4]
In [ ]:
dfprepro["web_corpus_freq"].plot(kind="hist")
Out[ ]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f8735866510>
In [ ]:
dfprepro['web_corpus_freq'].describe()
Out[ ]:
count    4682.000000
mean        6.285135
std         0.843987
min         4.000000
25%         6.000000
50%         6.000000
75%         7.000000
max         9.000000
Name: web_corpus_freq, dtype: float64
In [ ]:
dfprepro['web_corpus_log'].describe()
Out[ ]:
count                           4682
unique                            10
top       (68973807.5, 2022459848.0]
freq                             469
Name: web_corpus_log, dtype: object
In [ ]:
dfprepro = dfprepro.drop(["web_corpus_log","word"], axis=1)
#dfprepro.loc[(dfprepro['web_corpus_freq'].isnull() == True), 'web_corpus_freq'] = dfprepro['web_corpus_freq'].mean()
dfprepro.isnull().sum()
dfprepro.head()
Out[ ]:
length arousal valence dominance familiarity aoa semsize masculinity polysemy web_corpus_freq perceivability
0 8 4.200 2.864 4.333 2.382 6.760 4.652 5.391 0 5.0 4.9230
1 5 3.125 5.781 4.667 3.324 5.177 5.121 3.303 0 6.0 5.6250
2 10 3.273 5.250 5.235 5.121 5.543 2.667 3.971 0 5.0 3.2315
3 8 4.194 3.767 4.419 3.971 6.233 4.679 5.167 0 5.0 2.9415
4 10 3.846 3.880 4.800 3.097 6.407 5.083 4.571 0 5.0 2.9315
In [ ]:
pca = PCA(n_components=2)
pca.fit(dfprepro)
PCA_df = pd.DataFrame(pca.transform(dfprepro))
pca.transform(dfprepro)
score_pca = pca.transform(dfprepro)
In [ ]:

In [ ]:
scaler = MinMaxScaler()

Y_pca=PCA_df.values

Y_pca_minmax = scaler.fit_transform(Y_pca)

scaled_dfprepro = pd.DataFrame(data = Y_pca_minmax, columns = PCA_df.columns)

scaled_dfprepro.head()

X=dfprepro.values

X_minmax = scaler.fit_transform(X)

df_xminmax = pd.DataFrame(data = X_minmax, columns = dfprepro.columns)
In [ ]:
scaled_dfprepro.head()
Out[ ]:
0 1
0 0.567559 0.141222
1 0.313091 0.396255
2 0.603087 0.434351
3 0.573763 0.250204
4 0.696536 0.292227
In [ ]:
df_xminmax['word']=df_perc['word']
In [ ]:
cols = df_xminmax.columns.tolist()
cols = cols[-1:] + cols[:-1]

df_xminmax=df_xminmax[cols]


df_xminmax.head()
Out[ ]:
word length arousal valence dominance familiarity aoa semsize masculinity polysemy web_corpus_freq perceivability
0 abattoir 0.428571 0.350163 0.240777 0.372006 0.138889 0.963317 0.591837 0.735388 0.0 0.2 0.609099
1 abbey 0.214286 0.174510 0.623736 0.423950 0.316893 0.688108 0.676540 0.385698 0.0 0.4 0.746168
2 abbreviate 0.571429 0.198693 0.554024 0.512286 0.656463 0.751739 0.233339 0.497572 0.0 0.2 0.278825
3 abdicate 0.428571 0.349183 0.359328 0.385381 0.439153 0.871697 0.596713 0.697873 0.0 0.2 0.222201
4 abdication 0.571429 0.292320 0.374163 0.444635 0.273998 0.901947 0.669677 0.598057 0.0 0.2 0.220248
In [ ]:
df_xminmax=df_xminmax.drop(['word'],axis=1)

df_xminmax.describe()
Out[ ]:
length arousal valence dominance familiarity aoa semsize masculinity polysemy web_corpus_freq perceivability
count 4682.000000 4682.000000 4682.000000 4682.000000 4682.000000 4682.000000 4682.000000 4682.000000 4682.000000 4682.000000 4682.000000
mean 0.310597 0.428289 0.532598 0.482728 0.684871 0.508419 0.498718 0.519165 0.080948 0.457027 0.554749
std 0.143302 0.179275 0.209314 0.144739 0.174077 0.217797 0.184810 0.152787 0.272785 0.168797 0.266786
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 0.214286 0.292810 0.405015 0.402488 0.578042 0.329451 0.372584 0.436443 0.000000 0.400000 0.313092
50% 0.285714 0.410784 0.559275 0.494868 0.716364 0.514256 0.507766 0.522693 0.000000 0.400000 0.538709
75% 0.428571 0.549346 0.664041 0.569051 0.816704 0.683762 0.633375 0.612293 0.000000 0.600000 0.813629
max 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000
In [ ]:
PCA_df
Out[ ]:
0 1
0 3.120256 -3.279911
1 -0.691649 -0.663581
2 3.652448 -0.272761
3 3.213184 -2.161888
4 5.052306 -1.730781
... ... ...
4677 2.402470 -0.047095
4678 -2.805922 -1.236960
4679 -1.332992 0.901015
4680 -3.880212 1.497711
4681 1.500057 0.195502

4682 rows × 2 columns

In [ ]:
x = PCA_df[0]
y = PCA_df[1]
#z = PCA_df[2]
print(type (x))
print(type (y))
#print(type (z))
print(type (Y_pca_minmax))
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'numpy.ndarray'>
In [ ]:
print (Y_pca_minmax)
[[0.5675594  0.14122187]
 [0.31309066 0.39625457]
 [0.60308653 0.43435069]
 ...
 [0.270277   0.54876721]
 [0.10023392 0.60693153]
 [0.45940086 0.47999567]]

2.2 Clustering analysis by K-Means¶

In [ ]:
kmeans = KMeans( n_clusters=3, n_init=10, max_iter=100 )
kmeans.fit(Y_pca_minmax)
Out[ ]:
KMeans(max_iter=100, n_clusters=3)
In [ ]:
kmeans.labels_
Out[ ]:
array([2, 0, 2, ..., 0, 0, 1], dtype=int32)
In [ ]:
print('SSE', kmeans.inertia_)
print('Silhouette', silhouette_score(Y_pca_minmax, kmeans.labels_))
SSE 107.1169618212198
Silhouette 0.36741782484823465
In [ ]:
print('Calinski-Harabasz', metrics.calinski_harabasz_score(Y_pca_minmax, kmeans.labels_))
Calinski-Harabasz 3575.890305688165
In [ ]:
silhouette_list = []

#for k in range(2, 50):
#    kmeans = KMeans(n_clusters=k, n_init=10, max_iter=100)
#    kmeans.fit(Y_pca_minmax)
#    silhouette_list.append( silhouette_score(Y_pca_minmax, kmeans.labels_))
In [ ]:
#plt.plot(range(2, len(silhouette_list)+2), silhouette_list, marker='*')
#plt.ylabel('Silhouette score')
#plt.show()
In [ ]:
kmeans = KMeans(n_clusters=3, n_init=10, max_iter=100)
kmeans.fit(Y_pca_minmax)
Out[ ]:
KMeans(max_iter=100, n_clusters=3)
In [ ]:
centers = kmeans.cluster_centers_
In [ ]:
kmeans.labels_
Out[ ]:
array([2, 1, 2, ..., 1, 1, 0], dtype=int32)
In [ ]:
from matplotlib import cm
plt.scatter( Y_pca[:,0], Y_pca[:,1], s=40, c= kmeans.labels_ , cmap = cm.tab20c)

plt.xticks(fontsize=20)
plt.yticks(fontsize=20)

plt.show()
In [ ]:
plt.scatter( Y_pca_minmax[:,0], Y_pca_minmax[:,1], s=40, c= kmeans.labels_ , cmap = cm.tab20c)
plt.scatter( centers[:,0], centers[:,1], c='red', marker='*', s=200 )

plt.xticks(fontsize=20)
plt.yticks(fontsize=20)

plt.show()
In [ ]:
scaled_dfprepro['cluster'] = kmeans.labels_

df_xminmax['cluster'] = kmeans.labels_

df_xminmax['cluster']=df_xminmax['cluster'].map({0:'1', 1:'2', 2:'3'})

df_xminmax_plot=df_xminmax.drop(["polysemy"], axis=1)

plt.figure(figsize = (8,8))
pie = scaled_dfprepro.groupby(['cluster']).size().to_frame().reset_index()
pie.rename(columns={0: 'count'}, inplace=True)

pie_labels = ['first cluster', 'second cluster', 'third cluster']
plt.pie(pie['count'], labels=pie_labels)
plt.show()
sb.set(font_scale=3.7)

fig, axs = plt.subplots(ncols=2, nrows=6, figsize=(30,60))
for cols, x in zip(df_xminmax_plot.columns[:5], range(5)):
    for y in range(1):
        g1=sb.boxplot(y=cols, x='cluster', data=df_xminmax_plot, order=['1', '2','3'], ax=axs[x,y], showfliers = True )
        g1.set(xlabel=None)

for cols, x in zip(df_xminmax_plot.columns[5:], range(5)):
    for y in range(1,2):
        g2=sb.boxplot(y=cols, x='cluster', data=df_xminmax_plot, order=['1', '2','3'], ax=axs[x,y], showfliers = True )
        g2.set(xlabel=None)
In [ ]:
df_perc.isnull().sum()

df_xminmax_plot['word']=df_perc['word']
df_xminmax_plot.loc[df_xminmax['cluster']=='First']
Out[ ]:
length arousal valence dominance familiarity aoa semsize masculinity web_corpus_freq perceivability cluster word
In [ ]:
df_xminmax_plot.loc[df_xminmax['cluster']=='Second']
Out[ ]:
length arousal valence dominance familiarity aoa semsize masculinity web_corpus_freq perceivability cluster word
In [ ]:
df_xminmax_plot.loc[df_xminmax['cluster']=='Third']
Out[ ]:
length arousal valence dominance familiarity aoa semsize masculinity web_corpus_freq perceivability cluster word

2.3 Analysis by density-based clustering¶

In [ ]:
from sklearn.neighbors import NearestNeighbors

neighbors = NearestNeighbors(n_neighbors=20)
neighbors_fit = neighbors.fit(Y_pca_minmax)
distances, indices = neighbors_fit.kneighbors(Y_pca_minmax)
distances = np.sort(distances, axis=0)
distances = distances[:,1]
plt.plot(distances)
Out[ ]:
[<matplotlib.lines.Line2D at 0x7f8731efc310>]
In [ ]:
fig = plt.figure(figsize=(20, 12))
fig.subplots_adjust(hspace=.5, wspace=.2)
i = 1

for x in range(0, 15, 1):
    #y=-2+x/10
    eps = 0.012+1*x/1000
    db = DBSCAN(eps=eps, min_samples=20).fit(Y_pca_minmax)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_
    
    #print(eps)
    ax = fig.add_subplot(3, 5, i)
    ax.title.set_text("eps = {}".format(round(eps, 3)))
    #ax.text(1, 4, "eps = {}".format(round(eps, 3)), fontsize=25, ha="center")
    sb.scatterplot(Y_pca_minmax[:,0], Y_pca_minmax[:,1], hue=["{} cluster".format(x) for x in labels],legend=False,palette='tab20c',markers='.',size=1)
    ax.text(0.5,-0.3, "{} cluster(s)".format(max(labels)+1), size=12, ha="center", transform=ax.transAxes)


    i += 1
/usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  FutureWarning
/usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  FutureWarning
/usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  FutureWarning
/usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  FutureWarning
/usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  FutureWarning
/usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  FutureWarning
/usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  FutureWarning
/usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  FutureWarning
/usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  FutureWarning
/usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  FutureWarning
/usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  FutureWarning
/usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  FutureWarning
/usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  FutureWarning
/usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  FutureWarning
/usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  FutureWarning
In [ ]:
dbscan = DBSCAN( eps=0.018, min_samples=20)
dbscan.fit(Y_pca_minmax)
Out[ ]:
DBSCAN(eps=0.018, min_samples=20)
In [ ]:
dbscan.labels_
Out[ ]:
array([-1,  0, -1, ...,  0, -1,  5])
In [ ]:
unique_labels, unique_counts = np.unique(dbscan.labels_, return_counts=True)
print(max(unique_labels))
10
In [ ]:
#cols = [col for col in PCA_df.columns if col != 'class']

plt.scatter( Y_pca_minmax[:,0], Y_pca_minmax[:,1], s=25, c= dbscan.labels_, cmap = cm.tab20 )

plt.xticks(fontsize=2)
plt.yticks(fontsize=2)

plt.show()
In [ ]:
scaled_dfprepro['cluster'] = dbscan.labels_

df_xminmax['cluster'] = dbscan.labels_

df_xminmax['cluster']=df_xminmax['cluster'].map({0:'First', 1:'Second', 2:'Third',3:'Fourth', 4:'Fifth', 5:'Sixth',6:'Seventh', 7:'Eighth', 8:'Nineth',9:'Tenth', 10:'Eleventh', -1:'Noise'})
In [ ]:
scaled_dfprepro['cluster'] = dbscan.labels_

df_xminmax['cluster'] = dbscan.labels_

df_xminmax['cluster']=df_xminmax['cluster'].map({-1:'0',0:'1', 1:'2', 2:'3',3:'4',4:'5', 5:'6',6:'7', 7:'8', 8:'9',9:'10', 10:'11'})

df_xminmax_plot=df_xminmax.drop(["polysemy"], axis=1)

df_xminmax_plot = df_xminmax_plot.loc[df_xminmax["cluster"] != -1]


plt.figure(figsize = (8,8))


pie = df_xminmax_plot.groupby(['cluster']).size().to_frame().reset_index()
pie.rename(columns={0: 'count'}, inplace=True)

pie_labels = ['noise','first cluster', 'second cluster', 'third cluster','fourth cluster','fifth cluster','sixth cluster','seventh cluster','eighth cluster','nineth cluster','tenth cluster','eleventh cluster']
plt.pie(pie['count'], labels=pie_labels)
plt.show()

fig, axs = plt.subplots(ncols=2, nrows=6, figsize=(15,30))
for cols, x in zip(df_xminmax_plot.columns[:5], range(5)):
    for y in range(1):
        sb.boxplot(y=cols, x='cluster', data=df_xminmax_plot, order=['1', '2','3','4','5','6','7','8','9','10','11'], ax=axs[x,y], showfliers = True )

for cols, x in zip(df_xminmax_plot.columns[5:], range(5)):
    for y in range(1,2):
        sb.boxplot(y=cols, x='cluster', data=df_xminmax_plot, order=['1', '2','3','4','5','6','7','8','9','10','11'], ax=axs[x,y], showfliers = True )
In [ ]:
print('Silhouette', silhouette_score(Y_pca_minmax, dbscan.labels_))
Silhouette -0.34242476647615017
In [ ]:
print('Calinski-Harabasz', metrics.calinski_harabasz_score(Y_pca_minmax, dbscan.labels_))
Calinski-Harabasz 59.9496266793775
In [ ]:
df_perc.isnull().sum()

df_xminmax_plot['word']=df_perc['word']
df_xminmax_plot.loc[df_xminmax['cluster']=='First']
Out[ ]:
length arousal valence dominance familiarity aoa semsize masculinity web_corpus_freq perceivability cluster word
In [ ]:
df_xminmax_plot.loc[df_xminmax['cluster']=='Second']
Out[ ]:
length arousal valence dominance familiarity aoa semsize masculinity web_corpus_freq perceivability cluster word
In [ ]:
df_xminmax_plot.loc[df_xminmax['cluster']=='Third']
Out[ ]:
length arousal valence dominance familiarity aoa semsize masculinity web_corpus_freq perceivability cluster word
In [ ]:
df_xminmax_plot.loc[df_xminmax['cluster']=='Fourth']
Out[ ]:
length arousal valence dominance familiarity aoa semsize masculinity web_corpus_freq perceivability cluster word
In [ ]:
df_xminmax_plot.loc[df_xminmax['cluster']=='Fifth']
Out[ ]:
length arousal valence dominance familiarity aoa semsize masculinity web_corpus_freq perceivability cluster word

2.4 Analysis by hierarchical clustering¶

In [ ]:
data_dist = pdist(Y_pca_minmax, metric='euclidean')
data_link = linkage(data_dist, method='complete')

res = dendrogram(data_link, truncate_mode='lastp')
In [ ]:
hier = AgglomerativeClustering(n_clusters=6, affinity='euclidean', linkage='complete')
hier.fit(Y_pca_minmax)
hier.labels_
hier.n_clusters_
#hier.n_leaves_

plt.scatter( Y_pca_minmax[:,0], Y_pca_minmax[:,1],s=50, c= hier.labels_, marker='.', cmap = cm.tab20)

plt.xticks(fontsize=20)
plt.yticks(fontsize=20)

plt.show()
In [ ]:
#Store the labels
labels = hier.labels_

#Then get the frequency count of the non-negative labels
counts = np.bincount(labels[labels>=0])

print (counts)
[2205  310  355  393  732  687]
In [ ]:
print('Silhouette', silhouette_score(Y_pca_minmax, hier.labels_))
Silhouette 0.21403647105170687
In [ ]:
print('Calinski-Harabasz', metrics.calinski_harabasz_score(Y_pca_minmax, hier.labels_))
Calinski-Harabasz 2074.84678178521
In [ ]:
scaled_dfprepro['cluster'] = hier.labels_

df_xminmax['cluster'] = hier.labels_

df_xminmax['cluster']=df_xminmax['cluster'].map({0:'1', 1:'2', 2:'3',3:'4', 4:'5', 5:'6'})

df_xminmax_plot=df_xminmax.drop(["polysemy"], axis=1)

plt.figure(figsize = (8,8))
pie = scaled_dfprepro.groupby(['cluster']).size().to_frame().reset_index()
pie.rename(columns={0: 'count'}, inplace=True)

pie_labels = ['first cluster', 'second cluster', 'third cluster','fourth cluster', 'fifth cluster', 'sixth cluster']
plt.pie(pie['count'], labels=pie_labels)
plt.show()
sb.set(font_scale=3.7)


fig, axs = plt.subplots(ncols=2, nrows=6, figsize=(30,60))
for cols, x in zip(df_xminmax_plot.columns[:5], range(5)):
    for y in range(1):
        g1=sb.boxplot(y=cols, x='cluster', data=df_xminmax_plot, order=['1', '2','3','4', '5','6'], ax=axs[x,y], showfliers = True )
        g1.set(xlabel=None)


for cols, x in zip(df_xminmax_plot.columns[5:], range(5)):
    for y in range(1,2):
        g2=sb.boxplot(y=cols, x='cluster', data=df_xminmax_plot, order=['1', '2','3','4', '5','6'], ax=axs[x,y], showfliers = True )
        g2.set(xlabel=None)
In [ ]:
df_perc.isnull().sum()

df_xminmax_plot['word']=df_perc['word']
In [ ]:
df_xminmax_plot.loc[df_xminmax['cluster']=='First']
Out[ ]:
length arousal valence dominance familiarity aoa semsize masculinity web_corpus_freq perceivability cluster word
In [ ]:
df_xminmax_plot.loc[df_xminmax['cluster']=='Second']
Out[ ]:
length arousal valence dominance familiarity aoa semsize masculinity web_corpus_freq perceivability cluster word
In [ ]:
df_xminmax_plot.loc[df_xminmax['cluster']=='Third']
Out[ ]:
length arousal valence dominance familiarity aoa semsize masculinity web_corpus_freq perceivability cluster word
In [ ]:
df_xminmax_plot.loc[df_xminmax['cluster']=='Fourth']
Out[ ]:
length arousal valence dominance familiarity aoa semsize masculinity web_corpus_freq perceivability cluster word
In [ ]:
df_xminmax_plot.loc[df_xminmax['cluster']=='Fifth']
Out[ ]:
length arousal valence dominance familiarity aoa semsize masculinity web_corpus_freq perceivability cluster word
In [ ]:
# non serve ora 
#hier = AgglomerativeClustering(n_clusters=5, affinity='euclidean', linkage='ward')
#hier.fit(Y_pca_minmax)
#hier.labels_


#fig = plt.figure()
#ax = fig.add_subplot(111, projection='3d')

#ax.scatter(Y_pca_minmax[:,0], Y_pca_minmax[:,1], Y_pca_minmax[:,2], s=5, c= hier.labels_, marker='.')


#ax.plot3D(Yemo[:,1], Yemo[:,2], Yemo[:,3], c= hier.labels_, marker='.')

#plt.scatter( Yemo[:,1], Yemo[:,2], s=50, c= hier.labels_, marker='.' )



#plt.show()

2.5 Final Discussion¶

In [ ]:

3 Classification¶

3.1 Classification by Decision Trees¶

In [ ]:
# per il decision tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

# visualizzarlo
from sklearn import tree
import pydotplus 
from IPython.display import Image 

# evaluazione
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.metrics import roc_curve, auc, roc_auc_score

# hyperparameter tuning
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
# cross-validation
from sklearn.model_selection import cross_val_score
In [ ]:
df_class= dfprepro.copy()
df_class.head()
Out[ ]:
length arousal valence dominance familiarity aoa semsize masculinity polysemy web_corpus_freq perceivability
0 8 4.200 2.864 4.333 2.382 6.760 4.652 5.391 0 5.0 4.9230
1 5 3.125 5.781 4.667 3.324 5.177 5.121 3.303 0 6.0 5.6250
2 10 3.273 5.250 5.235 5.121 5.543 2.667 3.971 0 5.0 3.2315
3 8 4.194 3.767 4.419 3.971 6.233 4.679 5.167 0 5.0 2.9415
4 10 3.846 3.880 4.800 3.097 6.407 5.083 4.571 0 5.0 2.9315
In [ ]:

In [ ]:
attributes = [col for col in df_class.columns if col != 'polysemy']
X = df_class[attributes].values
y = df_class['polysemy']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=100)

len(df_class), X_train.shape[0], X_test.shape[0]
Out[ ]:
(4682, 3277, 1405)
In [ ]:
X_train.shape, X_test.shape
Out[ ]:
((3277, 10), (1405, 10))
In [ ]:
clf = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1)
clf.fit(X_train, y_train)
Out[ ]:
DecisionTreeClassifier()
In [ ]:
for col, imp in zip(attributes, clf.feature_importances_): print(col, imp)
length 0.07921529222712845
arousal 0.10707368272885229
valence 0.0955854527681356
dominance 0.12345386186225976
familiarity 0.1131051588586401
aoa 0.10922341363043479
semsize 0.10343533635767627
masculinity 0.10334263776768145
web_corpus_freq 0.024459922367955262
perceivability 0.1411052414312361
In [ ]:
dot_data = tree.export_graphviz (clf, out_file=None,
                                feature_names=attributes,
                                class_names=[str(v) for v in clf.classes_],
                                filled=True, rounded=True,
                                special_characters=True,
                                max_depth=2)

graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())
Out[ ]:
In [ ]:
# apply decision tree to train set
y_pred = clf.predict(X_train)
y_pred[:5]
Out[ ]:
array([0, 1, 0, 0, 0])
In [ ]:
y_train.values[:5]
Out[ ]:
array([0, 1, 0, 0, 0])
In [ ]:
print('Accuracy', accuracy_score(y_train, y_pred))
print('F1', f1_score(y_train, y_pred, average=None))
Accuracy 1.0
F1 [1. 1.]
In [ ]:
print( classification_report(y_train, y_pred) )
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3012
           1       1.00      1.00      1.00       265

    accuracy                           1.00      3277
   macro avg       1.00      1.00      1.00      3277
weighted avg       1.00      1.00      1.00      3277

In [ ]:
# Confusion matrix for trainset
# TP, FN, FP, TN
confusion_matrix(y_train, y_pred)
Out[ ]:
array([[3012,    0],
       [   0,  265]])
In [ ]:
# apply decision tree to test set
y_pred = clf.predict(X_test)
In [ ]:
print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average=None))
print(classification_report(y_test, y_pred))
confusion_matrix(y_test, y_pred)
Accuracy 0.8597864768683274
F1-score [0.92325672 0.18930041]
              precision    recall  f1-score   support

           0       0.93      0.92      0.92      1291
           1       0.18      0.20      0.19       114

    accuracy                           0.86      1405
   macro avg       0.55      0.56      0.56      1405
weighted avg       0.87      0.86      0.86      1405

Out[ ]:
array([[1185,  106],
       [  91,   23]])
In [ ]:
y_score = clf.predict_proba(X_test)
In [ ]:
fpr, tpr, th = roc_curve(y_test, y_score[:,1])

roc_auc = auc(fpr, tpr)
print(roc_auc)
0.5598237460420998
In [ ]:
plt.figure(figsize=(8,5))

plt.plot(fpr, tpr)

plt.plot([0,1], [0,1], 'k--')
plt.xlabel('False Positive Rate', fontsize=20)
plt.ylabel('True Positive Rate', fontsize=20)

plt.tick_params(axis='both', which='major', labelsize=22)
plt.show()

Preprocess¶

In [ ]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import cross_val_score
In [ ]:
dfprepro.head()
Out[ ]:
length arousal valence dominance familiarity aoa semsize masculinity polysemy web_corpus_freq perceivability
0 8 4.200 2.864 4.333 2.382 6.760 4.652 5.391 0 5.0 4.9230
1 5 3.125 5.781 4.667 3.324 5.177 5.121 3.303 0 6.0 5.6250
2 10 3.273 5.250 5.235 5.121 5.543 2.667 3.971 0 5.0 3.2315
3 8 4.194 3.767 4.419 3.971 6.233 4.679 5.167 0 5.0 2.9415
4 10 3.846 3.880 4.800 3.097 6.407 5.083 4.571 0 5.0 2.9315
In [ ]:
df_class_ref = dfprepro.copy()
#dataframe = [df_class_ref]
#for dataset in dataframe:
#    dataset.loc[(dataset["aoa"] > 1) & (dataset["aoa"] <= 2), "aoa"] = 1
#    dataset.loc[(dataset["aoa"] > 2)& (dataset["aoa"] <= 3), "aoa"] = 2
#    dataset.loc[(dataset["aoa"] > 3)& (dataset["aoa"] <= 4), "aoa"] = 3
#    dataset.loc[(dataset["aoa"] > 4)& (dataset["aoa"] <= 5), "aoa"] = 4
#    dataset.loc[(dataset["aoa"] > 5)& (dataset["aoa"] <= 6), "aoa"] = 5
#    dataset.loc[(dataset["aoa"] > 6)&( dataset["aoa"] <= 7), "aoa"] = 6
#    dataset.loc[(dataset["aoa"] > 7), "aoa"] = 7
#df_class_ref.head()
In [ ]:
var_to_scale=['aoa',"arousal","valence","dominance","familiarity","semsize","masculinity","perceivability"]

features = df_class_ref[var_to_scale]
scaler = MinMaxScaler().fit(features.values)
features = scaler.transform(features.values)

df_class_ref[var_to_scale] = features
df_class_ref.head()
Out[ ]:
length arousal valence dominance familiarity aoa semsize masculinity polysemy web_corpus_freq perceivability
0 8 0.350163 0.240777 0.372006 0.138889 0.963317 0.591837 0.735388 0 5.0 0.609099
1 5 0.174510 0.623736 0.423950 0.316893 0.688108 0.676540 0.385698 0 6.0 0.746168
2 10 0.198693 0.554024 0.512286 0.656463 0.751739 0.233339 0.497572 0 5.0 0.278825
3 8 0.349183 0.359328 0.385381 0.439153 0.871697 0.596713 0.697873 0 5.0 0.222201
4 10 0.292320 0.374163 0.444635 0.273998 0.901947 0.669677 0.598057 0 5.0 0.220248

Arousal¶

In [ ]:
refvar="arousal"
taglio=0.55

X=df_class_ref.drop(refvar,axis=1).copy()

y=df_class_ref[refvar].copy()

y_up_index = y >= taglio

y[y_up_index]=1

y_zero_index = y < taglio

y[y_zero_index]=0

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

clf_dt = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1,random_state=42)

clf_dt = clf_dt.fit(X_train, y_train)
In [ ]:
plt.figure(figsize=(15,7.5))

from sklearn.tree import plot_tree
plot_tree(clf_dt,
          filled=True,
          rounded=True,
          class_names=["not aroused","aroused"],
          feature_names=X.columns)
Out[ ]:
[Text(534.7704919477663, 399.546, 'valence <= 0.698\ngini = 0.372\nsamples = 3511\nvalue = [2645, 866]\nclass = not aroused'),
 Text(355.33135788018956, 383.238, 'semsize <= 0.638\ngini = 0.25\nsamples = 2829\nvalue = [2414, 415]\nclass = not aroused'),
 Text(185.56261986687727, 366.93, 'semsize <= 0.496\ngini = 0.164\nsamples = 2299\nvalue = [2092, 207]\nclass = not aroused'),
 Text(95.9751029444946, 350.62199999999996, 'masculinity <= 0.369\ngini = 0.101\nsamples = 1542\nvalue = [1460, 82]\nclass = not aroused'),
 Text(29.20938628158845, 334.31399999999996, 'semsize <= 0.366\ngini = 0.25\nsamples = 171\nvalue = [146, 25]\nclass = not aroused'),
 Text(16.115523465703973, 318.006, 'valence <= 0.654\ngini = 0.142\nsamples = 117\nvalue = [108, 9]\nclass = not aroused'),
 Text(8.057761732851986, 301.698, 'web_corpus_freq <= 5.5\ngini = 0.063\nsamples = 92\nvalue = [89, 3]\nclass = not aroused'),
 Text(4.028880866425993, 285.39, 'aoa <= 0.251\ngini = 0.231\nsamples = 15\nvalue = [13, 2]\nclass = not aroused'),
 Text(2.0144404332129966, 269.082, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(6.04332129963899, 269.082, 'dominance <= 0.498\ngini = 0.133\nsamples = 14\nvalue = [13, 1]\nclass = not aroused'),
 Text(4.028880866425993, 252.774, 'gini = 0.0\nsamples = 10\nvalue = [10, 0]\nclass = not aroused'),
 Text(8.057761732851986, 252.774, 'semsize <= 0.255\ngini = 0.375\nsamples = 4\nvalue = [3, 1]\nclass = not aroused'),
 Text(6.04332129963899, 236.46599999999998, 'gini = 0.0\nsamples = 3\nvalue = [3, 0]\nclass = not aroused'),
 Text(10.072202166064983, 236.46599999999998, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(12.08664259927798, 285.39, 'valence <= 0.636\ngini = 0.026\nsamples = 77\nvalue = [76, 1]\nclass = not aroused'),
 Text(10.072202166064983, 269.082, 'gini = 0.0\nsamples = 65\nvalue = [65, 0]\nclass = not aroused'),
 Text(14.101083032490976, 269.082, 'valence <= 0.637\ngini = 0.153\nsamples = 12\nvalue = [11, 1]\nclass = not aroused'),
 Text(12.08664259927798, 252.774, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(16.115523465703973, 252.774, 'gini = 0.0\nsamples = 11\nvalue = [11, 0]\nclass = not aroused'),
 Text(24.17328519855596, 301.698, 'semsize <= 0.142\ngini = 0.365\nsamples = 25\nvalue = [19, 6]\nclass = not aroused'),
 Text(22.158844765342963, 285.39, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'),
 Text(26.187725631768956, 285.39, 'masculinity <= 0.216\ngini = 0.287\nsamples = 23\nvalue = [19, 4]\nclass = not aroused'),
 Text(22.158844765342963, 269.082, 'dominance <= 0.527\ngini = 0.48\nsamples = 5\nvalue = [2, 3]\nclass = aroused'),
 Text(20.144404332129966, 252.774, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]\nclass = aroused'),
 Text(24.17328519855596, 252.774, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'),
 Text(30.21660649819495, 269.082, 'familiarity <= 0.565\ngini = 0.105\nsamples = 18\nvalue = [17, 1]\nclass = not aroused'),
 Text(28.202166064981952, 252.774, 'familiarity <= 0.554\ngini = 0.444\nsamples = 3\nvalue = [2, 1]\nclass = not aroused'),
 Text(26.187725631768956, 236.46599999999998, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'),
 Text(30.21660649819495, 236.46599999999998, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(32.231046931407946, 252.774, 'gini = 0.0\nsamples = 15\nvalue = [15, 0]\nclass = not aroused'),
 Text(42.30324909747293, 318.006, 'dominance <= 0.256\ngini = 0.417\nsamples = 54\nvalue = [38, 16]\nclass = not aroused'),
 Text(40.28880866425993, 301.698, 'gini = 0.0\nsamples = 4\nvalue = [0, 4]\nclass = aroused'),
 Text(44.317689530685925, 301.698, 'dominance <= 0.581\ngini = 0.365\nsamples = 50\nvalue = [38, 12]\nclass = not aroused'),
 Text(40.28880866425993, 285.39, 'aoa <= 0.67\ngini = 0.298\nsamples = 44\nvalue = [36, 8]\nclass = not aroused'),
 Text(38.274368231046935, 269.082, 'aoa <= 0.304\ngini = 0.383\nsamples = 31\nvalue = [23, 8]\nclass = not aroused'),
 Text(36.25992779783394, 252.774, 'gini = 0.0\nsamples = 9\nvalue = [9, 0]\nclass = not aroused'),
 Text(40.28880866425993, 252.774, 'masculinity <= 0.352\ngini = 0.463\nsamples = 22\nvalue = [14, 8]\nclass = not aroused'),
 Text(38.274368231046935, 236.46599999999998, 'aoa <= 0.433\ngini = 0.42\nsamples = 20\nvalue = [14, 6]\nclass = not aroused'),
 Text(34.24548736462094, 220.158, 'perceivability <= 0.658\ngini = 0.444\nsamples = 6\nvalue = [2, 4]\nclass = aroused'),
 Text(32.231046931407946, 203.85, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]\nclass = aroused'),
 Text(36.25992779783394, 203.85, 'familiarity <= 0.821\ngini = 0.444\nsamples = 3\nvalue = [2, 1]\nclass = not aroused'),
 Text(34.24548736462094, 187.542, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'),
 Text(38.274368231046935, 187.542, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(42.30324909747293, 220.158, 'familiarity <= 0.706\ngini = 0.245\nsamples = 14\nvalue = [12, 2]\nclass = not aroused'),
 Text(40.28880866425993, 203.85, 'gini = 0.0\nsamples = 10\nvalue = [10, 0]\nclass = not aroused'),
 Text(44.317689530685925, 203.85, 'perceivability <= 0.49\ngini = 0.5\nsamples = 4\nvalue = [2, 2]\nclass = not aroused'),
 Text(42.30324909747293, 187.542, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'),
 Text(46.33212996389892, 187.542, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'),
 Text(42.30324909747293, 236.46599999999998, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'),
 Text(42.30324909747293, 269.082, 'gini = 0.0\nsamples = 13\nvalue = [13, 0]\nclass = not aroused'),
 Text(48.34657039711192, 285.39, 'dominance <= 0.629\ngini = 0.444\nsamples = 6\nvalue = [2, 4]\nclass = aroused'),
 Text(46.33212996389892, 269.082, 'gini = 0.0\nsamples = 4\nvalue = [0, 4]\nclass = aroused'),
 Text(50.361010830324915, 269.082, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'),
 Text(162.74081960740074, 334.31399999999996, 'masculinity <= 0.696\ngini = 0.08\nsamples = 1371\nvalue = [1314, 57]\nclass = not aroused'),
 Text(134.61340816787006, 318.006, 'valence <= 0.626\ngini = 0.065\nsamples = 1246\nvalue = [1204, 42]\nclass = not aroused'),
 Text(109.83421705776175, 301.698, 'aoa <= 0.971\ngini = 0.051\nsamples = 1116\nvalue = [1087, 29]\nclass = not aroused'),
 Text(95.21378610108304, 285.39, 'valence <= 0.364\ngini = 0.049\nsamples = 1114\nvalue = [1086, 28]\nclass = not aroused'),
 Text(72.01624548736463, 269.082, 'perceivability <= 0.894\ngini = 0.1\nsamples = 245\nvalue = [232, 13]\nclass = not aroused'),
 Text(65.46931407942239, 252.774, 'dominance <= 0.58\ngini = 0.066\nsamples = 233\nvalue = [225, 8]\nclass = not aroused'),
 Text(63.45487364620939, 236.46599999999998, 'aoa <= 0.458\ngini = 0.059\nsamples = 232\nvalue = [225, 7]\nclass = not aroused'),
 Text(56.404332129963905, 220.158, 'aoa <= 0.45\ngini = 0.139\nsamples = 80\nvalue = [74, 6]\nclass = not aroused'),
 Text(52.37545126353791, 203.85, 'familiarity <= 0.408\ngini = 0.098\nsamples = 77\nvalue = [73, 4]\nclass = not aroused'),
 Text(50.361010830324915, 187.542, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(54.38989169675091, 187.542, 'dominance <= 0.522\ngini = 0.076\nsamples = 76\nvalue = [73, 3]\nclass = not aroused'),
 Text(49.35379061371842, 171.23399999999998, 'perceivability <= 0.861\ngini = 0.053\nsamples = 73\nvalue = [71, 2]\nclass = not aroused'),
 Text(45.32490974729242, 154.926, 'aoa <= 0.24\ngini = 0.029\nsamples = 68\nvalue = [67, 1]\nclass = not aroused'),
 Text(43.31046931407943, 138.618, 'perceivability <= 0.356\ngini = 0.124\nsamples = 15\nvalue = [14, 1]\nclass = not aroused'),
 Text(41.29602888086643, 122.31, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(45.32490974729242, 122.31, 'gini = 0.0\nsamples = 14\nvalue = [14, 0]\nclass = not aroused'),
 Text(47.33935018050542, 138.618, 'gini = 0.0\nsamples = 53\nvalue = [53, 0]\nclass = not aroused'),
 Text(53.38267148014441, 154.926, 'masculinity <= 0.65\ngini = 0.32\nsamples = 5\nvalue = [4, 1]\nclass = not aroused'),
 Text(51.36823104693141, 138.618, 'gini = 0.0\nsamples = 4\nvalue = [4, 0]\nclass = not aroused'),
 Text(55.397111913357406, 138.618, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(59.4259927797834, 171.23399999999998, 'semsize <= 0.39\ngini = 0.444\nsamples = 3\nvalue = [2, 1]\nclass = not aroused'),
 Text(57.4115523465704, 154.926, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(61.440433212996396, 154.926, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'),
 Text(60.4332129963899, 203.85, 'length <= 5.0\ngini = 0.444\nsamples = 3\nvalue = [1, 2]\nclass = aroused'),
 Text(58.4187725631769, 187.542, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'),
 Text(62.447653429602894, 187.542, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'),
 Text(70.50541516245488, 220.158, 'masculinity <= 0.42\ngini = 0.013\nsamples = 152\nvalue = [151, 1]\nclass = not aroused'),
 Text(68.49097472924188, 203.85, 'masculinity <= 0.412\ngini = 0.18\nsamples = 10\nvalue = [9, 1]\nclass = not aroused'),
 Text(66.47653429602889, 187.542, 'gini = 0.0\nsamples = 9\nvalue = [9, 0]\nclass = not aroused'),
 Text(70.50541516245488, 187.542, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(72.51985559566788, 203.85, 'gini = 0.0\nsamples = 142\nvalue = [142, 0]\nclass = not aroused'),
 Text(67.48375451263539, 236.46599999999998, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(78.56317689530687, 252.774, 'semsize <= 0.182\ngini = 0.486\nsamples = 12\nvalue = [7, 5]\nclass = not aroused'),
 Text(76.54873646209387, 236.46599999999998, 'dominance <= 0.337\ngini = 0.219\nsamples = 8\nvalue = [7, 1]\nclass = not aroused'),
 Text(74.53429602888087, 220.158, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(78.56317689530687, 220.158, 'gini = 0.0\nsamples = 7\nvalue = [7, 0]\nclass = not aroused'),
 Text(80.57761732851986, 236.46599999999998, 'gini = 0.0\nsamples = 4\nvalue = [0, 4]\nclass = aroused'),
 Text(118.41132671480146, 269.082, 'familiarity <= 0.858\ngini = 0.034\nsamples = 869\nvalue = [854, 15]\nclass = not aroused'),
 Text(103.86958483754513, 252.774, 'masculinity <= 0.487\ngini = 0.023\nsamples = 760\nvalue = [751, 9]\nclass = not aroused'),
 Text(90.90162454873646, 236.46599999999998, 'masculinity <= 0.486\ngini = 0.053\nsamples = 222\nvalue = [216, 6]\nclass = not aroused'),
 Text(84.10288808664261, 220.158, 'dominance <= 0.514\ngini = 0.045\nsamples = 219\nvalue = [214, 5]\nclass = not aroused'),
 Text(76.54873646209387, 203.85, 'masculinity <= 0.483\ngini = 0.013\nsamples = 156\nvalue = [155, 1]\nclass = not aroused'),
 Text(74.53429602888087, 187.542, 'gini = 0.0\nsamples = 146\nvalue = [146, 0]\nclass = not aroused'),
 Text(78.56317689530687, 187.542, 'masculinity <= 0.483\ngini = 0.18\nsamples = 10\nvalue = [9, 1]\nclass = not aroused'),
 Text(76.54873646209387, 171.23399999999998, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(80.57761732851986, 171.23399999999998, 'gini = 0.0\nsamples = 9\nvalue = [9, 0]\nclass = not aroused'),
 Text(91.65703971119135, 203.85, 'dominance <= 0.516\ngini = 0.119\nsamples = 63\nvalue = [59, 4]\nclass = not aroused'),
 Text(86.62093862815885, 187.542, 'web_corpus_freq <= 7.0\ngini = 0.444\nsamples = 3\nvalue = [1, 2]\nclass = aroused'),
 Text(84.60649819494586, 171.23399999999998, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'),
 Text(88.63537906137185, 171.23399999999998, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'),
 Text(96.69314079422384, 187.542, 'masculinity <= 0.38\ngini = 0.064\nsamples = 60\nvalue = [58, 2]\nclass = not aroused'),
 Text(92.66425992779784, 171.23399999999998, 'semsize <= 0.277\ngini = 0.375\nsamples = 4\nvalue = [3, 1]\nclass = not aroused'),
 Text(90.64981949458485, 154.926, 'gini = 0.0\nsamples = 3\nvalue = [3, 0]\nclass = not aroused'),
 Text(94.67870036101084, 154.926, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(100.72202166064983, 171.23399999999998, 'semsize <= 0.48\ngini = 0.035\nsamples = 56\nvalue = [55, 1]\nclass = not aroused'),
 Text(98.70758122743683, 154.926, 'gini = 0.0\nsamples = 51\nvalue = [51, 0]\nclass = not aroused'),
 Text(102.73646209386283, 154.926, 'perceivability <= 0.592\ngini = 0.32\nsamples = 5\nvalue = [4, 1]\nclass = not aroused'),
 Text(100.72202166064983, 138.618, 'gini = 0.0\nsamples = 4\nvalue = [4, 0]\nclass = not aroused'),
 Text(104.75090252707582, 138.618, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(97.70036101083033, 220.158, 'length <= 8.0\ngini = 0.444\nsamples = 3\nvalue = [2, 1]\nclass = not aroused'),
 Text(95.68592057761734, 203.85, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'),
 Text(99.71480144404333, 203.85, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(116.8375451263538, 236.46599999999998, 'aoa <= 0.882\ngini = 0.011\nsamples = 538\nvalue = [535, 3]\nclass = not aroused'),
 Text(110.79422382671481, 220.158, 'masculinity <= 0.665\ngini = 0.008\nsamples = 525\nvalue = [523, 2]\nclass = not aroused'),
 Text(104.75090252707582, 203.85, 'familiarity <= 0.829\ngini = 0.004\nsamples = 494\nvalue = [493, 1]\nclass = not aroused'),
 Text(102.73646209386283, 187.542, 'gini = 0.0\nsamples = 453\nvalue = [453, 0]\nclass = not aroused'),
 Text(106.76534296028882, 187.542, 'familiarity <= 0.831\ngini = 0.048\nsamples = 41\nvalue = [40, 1]\nclass = not aroused'),
 Text(104.75090252707582, 171.23399999999998, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(108.77978339350182, 171.23399999999998, 'gini = 0.0\nsamples = 40\nvalue = [40, 0]\nclass = not aroused'),
 Text(116.8375451263538, 203.85, 'masculinity <= 0.667\ngini = 0.062\nsamples = 31\nvalue = [30, 1]\nclass = not aroused'),
 Text(114.8231046931408, 187.542, 'aoa <= 0.66\ngini = 0.444\nsamples = 3\nvalue = [2, 1]\nclass = not aroused'),
 Text(112.80866425992781, 171.23399999999998, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'),
 Text(116.8375451263538, 171.23399999999998, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(118.8519855595668, 187.542, 'gini = 0.0\nsamples = 28\nvalue = [28, 0]\nclass = not aroused'),
 Text(122.88086642599279, 220.158, 'aoa <= 0.884\ngini = 0.142\nsamples = 13\nvalue = [12, 1]\nclass = not aroused'),
 Text(120.8664259927798, 203.85, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(124.89530685920579, 203.85, 'gini = 0.0\nsamples = 12\nvalue = [12, 0]\nclass = not aroused'),
 Text(132.95306859205778, 252.774, 'familiarity <= 0.859\ngini = 0.104\nsamples = 109\nvalue = [103, 6]\nclass = not aroused'),
 Text(130.93862815884478, 236.46599999999998, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(134.96750902527077, 236.46599999999998, 'aoa <= 0.627\ngini = 0.088\nsamples = 108\nvalue = [103, 5]\nclass = not aroused'),
 Text(130.93862815884478, 220.158, 'dominance <= 0.558\ngini = 0.056\nsamples = 104\nvalue = [101, 3]\nclass = not aroused'),
 Text(128.92418772563178, 203.85, 'gini = 0.0\nsamples = 76\nvalue = [76, 0]\nclass = not aroused'),
 Text(132.95306859205778, 203.85, 'dominance <= 0.559\ngini = 0.191\nsamples = 28\nvalue = [25, 3]\nclass = not aroused'),
 Text(130.93862815884478, 187.542, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(134.96750902527077, 187.542, 'masculinity <= 0.423\ngini = 0.137\nsamples = 27\nvalue = [25, 2]\nclass = not aroused'),
 Text(130.93862815884478, 171.23399999999998, 'dominance <= 0.584\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = not aroused'),
 Text(128.92418772563178, 154.926, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(132.95306859205778, 154.926, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'),
 Text(138.99638989169677, 171.23399999999998, 'masculinity <= 0.615\ngini = 0.077\nsamples = 25\nvalue = [24, 1]\nclass = not aroused'),
 Text(136.98194945848377, 154.926, 'gini = 0.0\nsamples = 22\nvalue = [22, 0]\nclass = not aroused'),
 Text(141.01083032490976, 154.926, 'valence <= 0.594\ngini = 0.444\nsamples = 3\nvalue = [2, 1]\nclass = not aroused'),
 Text(138.99638989169677, 138.618, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'),
 Text(143.02527075812276, 138.618, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(138.99638989169677, 220.158, 'familiarity <= 0.866\ngini = 0.5\nsamples = 4\nvalue = [2, 2]\nclass = not aroused'),
 Text(136.98194945848377, 203.85, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'),
 Text(141.01083032490976, 203.85, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'),
 Text(124.45464801444045, 285.39, 'perceivability <= 0.756\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = not aroused'),
 Text(122.44020758122745, 269.082, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'),
 Text(126.46908844765345, 269.082, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(159.39259927797835, 301.698, 'aoa <= 0.204\ngini = 0.18\nsamples = 130\nvalue = [117, 13]\nclass = not aroused'),
 Text(157.37815884476535, 285.39, 'gini = 0.0\nsamples = 30\nvalue = [30, 0]\nclass = not aroused'),
 Text(161.40703971119135, 285.39, 'dominance <= 0.715\ngini = 0.226\nsamples = 100\nvalue = [87, 13]\nclass = not aroused'),
 Text(159.39259927797835, 269.082, 'masculinity <= 0.446\ngini = 0.213\nsamples = 99\nvalue = [87, 12]\nclass = not aroused'),
 Text(157.37815884476535, 252.774, 'gini = 0.0\nsamples = 31\nvalue = [31, 0]\nclass = not aroused'),
 Text(161.40703971119135, 252.774, 'masculinity <= 0.482\ngini = 0.291\nsamples = 68\nvalue = [56, 12]\nclass = not aroused'),
 Text(151.08303249097474, 236.46599999999998, 'length <= 5.5\ngini = 0.473\nsamples = 13\nvalue = [8, 5]\nclass = not aroused'),
 Text(147.05415162454875, 220.158, 'dominance <= 0.51\ngini = 0.346\nsamples = 9\nvalue = [7, 2]\nclass = not aroused'),
 Text(145.03971119133575, 203.85, 'semsize <= 0.375\ngini = 0.5\nsamples = 4\nvalue = [2, 2]\nclass = not aroused'),
 Text(143.02527075812276, 187.542, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'),
 Text(147.05415162454875, 187.542, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'),
 Text(149.06859205776175, 203.85, 'gini = 0.0\nsamples = 5\nvalue = [5, 0]\nclass = not aroused'),
 Text(155.11191335740074, 220.158, 'semsize <= 0.451\ngini = 0.375\nsamples = 4\nvalue = [1, 3]\nclass = aroused'),
 Text(153.09747292418774, 203.85, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]\nclass = aroused'),
 Text(157.12635379061373, 203.85, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'),
 Text(171.73104693140795, 236.46599999999998, 'valence <= 0.628\ngini = 0.222\nsamples = 55\nvalue = [48, 7]\nclass = not aroused'),
 Text(163.16967509025272, 220.158, 'familiarity <= 0.59\ngini = 0.48\nsamples = 5\nvalue = [3, 2]\nclass = not aroused'),
 Text(161.15523465703973, 203.85, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'),
 Text(165.18411552346572, 203.85, 'semsize <= 0.455\ngini = 0.444\nsamples = 3\nvalue = [1, 2]\nclass = aroused'),
 Text(163.16967509025272, 187.542, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'),
 Text(167.19855595667872, 187.542, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'),
 Text(180.2924187725632, 220.158, 'aoa <= 0.245\ngini = 0.18\nsamples = 50\nvalue = [45, 5]\nclass = not aroused'),
 Text(173.2418772563177, 203.85, 'semsize <= 0.224\ngini = 0.48\nsamples = 5\nvalue = [3, 2]\nclass = not aroused'),
 Text(171.2274368231047, 187.542, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'),
 Text(175.2563176895307, 187.542, 'polysemy <= 0.5\ngini = 0.444\nsamples = 3\nvalue = [1, 2]\nclass = aroused'),
 Text(173.2418772563177, 171.23399999999998, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'),
 Text(177.2707581227437, 171.23399999999998, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'),
 Text(187.34296028880868, 203.85, 'familiarity <= 0.651\ngini = 0.124\nsamples = 45\nvalue = [42, 3]\nclass = not aroused'),
 Text(183.3140794223827, 187.542, 'perceivability <= 0.787\ngini = 0.408\nsamples = 7\nvalue = [5, 2]\nclass = not aroused'),
 Text(181.2996389891697, 171.23399999999998, 'familiarity <= 0.546\ngini = 0.444\nsamples = 3\nvalue = [1, 2]\nclass = aroused'),
 Text(179.2851985559567, 154.926, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'),
 Text(183.3140794223827, 154.926, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'),
 Text(185.3285198555957, 171.23399999999998, 'gini = 0.0\nsamples = 4\nvalue = [4, 0]\nclass = not aroused'),
 Text(191.37184115523468, 187.542, 'familiarity <= 0.828\ngini = 0.051\nsamples = 38\nvalue = [37, 1]\nclass = not aroused'),
 Text(189.35740072202168, 171.23399999999998, 'gini = 0.0\nsamples = 28\nvalue = [28, 0]\nclass = not aroused'),
 Text(193.38628158844767, 171.23399999999998, 'familiarity <= 0.831\ngini = 0.18\nsamples = 10\nvalue = [9, 1]\nclass = not aroused'),
 Text(191.37184115523468, 154.926, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(195.40072202166067, 154.926, 'gini = 0.0\nsamples = 9\nvalue = [9, 0]\nclass = not aroused'),
 Text(163.42148014440434, 269.082, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(190.86823104693144, 318.006, 'valence <= 0.234\ngini = 0.211\nsamples = 125\nvalue = [110, 15]\nclass = not aroused'),
 Text(183.3140794223827, 301.698, 'perceivability <= 0.584\ngini = 0.48\nsamples = 10\nvalue = [4, 6]\nclass = aroused'),
 Text(181.2996389891697, 285.39, 'length <= 4.5\ngini = 0.32\nsamples = 5\nvalue = [4, 1]\nclass = not aroused'),
 Text(179.2851985559567, 269.082, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(183.3140794223827, 269.082, 'gini = 0.0\nsamples = 4\nvalue = [4, 0]\nclass = not aroused'),
 Text(185.3285198555957, 285.39, 'gini = 0.0\nsamples = 5\nvalue = [0, 5]\nclass = aroused'),
 Text(198.42238267148016, 301.698, 'dominance <= 0.608\ngini = 0.144\nsamples = 115\nvalue = [106, 9]\nclass = not aroused'),
 Text(189.35740072202168, 285.39, 'masculinity <= 0.699\ngini = 0.076\nsamples = 101\nvalue = [97, 4]\nclass = not aroused'),
 Text(187.34296028880868, 269.082, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(191.37184115523468, 269.082, 'familiarity <= 0.141\ngini = 0.058\nsamples = 100\nvalue = [97, 3]\nclass = not aroused'),
 Text(187.34296028880868, 252.774, 'familiarity <= 0.098\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = not aroused'),
 Text(185.3285198555957, 236.46599999999998, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'),
 Text(189.35740072202168, 236.46599999999998, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(195.40072202166067, 252.774, 'perceivability <= 0.895\ngini = 0.04\nsamples = 98\nvalue = [96, 2]\nclass = not aroused'),
 Text(193.38628158844767, 236.46599999999998, 'gini = 0.0\nsamples = 62\nvalue = [62, 0]\nclass = not aroused'),
 Text(197.41516245487367, 236.46599999999998, 'perceivability <= 0.896\ngini = 0.105\nsamples = 36\nvalue = [34, 2]\nclass = not aroused'),
 Text(195.40072202166067, 220.158, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(199.42960288808666, 220.158, 'valence <= 0.329\ngini = 0.056\nsamples = 35\nvalue = [34, 1]\nclass = not aroused'),
 Text(197.41516245487367, 203.85, 'valence <= 0.293\ngini = 0.444\nsamples = 3\nvalue = [2, 1]\nclass = not aroused'),
 Text(195.40072202166067, 187.542, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'),
 Text(199.42960288808666, 187.542, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(201.44404332129966, 203.85, 'gini = 0.0\nsamples = 32\nvalue = [32, 0]\nclass = not aroused'),
 Text(207.48736462093865, 285.39, 'semsize <= 0.447\ngini = 0.459\nsamples = 14\nvalue = [9, 5]\nclass = not aroused'),
 Text(205.47292418772565, 269.082, 'aoa <= 0.752\ngini = 0.298\nsamples = 11\nvalue = [9, 2]\nclass = not aroused'),
 Text(203.45848375451266, 252.774, 'dominance <= 0.622\ngini = 0.18\nsamples = 10\nvalue = [9, 1]\nclass = not aroused'),
 Text(201.44404332129966, 236.46599999999998, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(205.47292418772565, 236.46599999999998, 'gini = 0.0\nsamples = 9\nvalue = [9, 0]\nclass = not aroused'),
 Text(207.48736462093865, 252.774, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(209.50180505415165, 269.082, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]\nclass = aroused'),
 Text(275.15013678925993, 350.62199999999996, 'masculinity <= 0.239\ngini = 0.276\nsamples = 757\nvalue = [632, 125]\nclass = not aroused'),
 Text(229.88227662454875, 334.31399999999996, 'masculinity <= 0.075\ngini = 0.298\nsamples = 11\nvalue = [2, 9]\nclass = aroused'),
 Text(227.86783619133575, 318.006, 'semsize <= 0.537\ngini = 0.444\nsamples = 3\nvalue = [2, 1]\nclass = not aroused'),
 Text(225.85339575812276, 301.698, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(229.88227662454875, 301.698, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'),
 Text(231.89671705776175, 318.006, 'gini = 0.0\nsamples = 8\nvalue = [0, 8]\nclass = aroused'),
 Text(320.41799695397117, 334.31399999999996, 'dominance <= 0.603\ngini = 0.263\nsamples = 746\nvalue = [630, 116]\nclass = not aroused'),
 Text(277.73310582129966, 318.006, 'masculinity <= 0.66\ngini = 0.225\nsamples = 658\nvalue = [573, 85]\nclass = not aroused'),
 Text(233.91115749097474, 301.698, 'masculinity <= 0.405\ngini = 0.189\nsamples = 538\nvalue = [481, 57]\nclass = not aroused'),
 Text(215.54512635379064, 285.39, 'dominance <= 0.347\ngini = 0.38\nsamples = 51\nvalue = [38, 13]\nclass = not aroused'),
 Text(213.53068592057764, 269.082, 'gini = 0.0\nsamples = 12\nvalue = [12, 0]\nclass = not aroused'),
 Text(217.55956678700363, 269.082, 'valence <= 0.488\ngini = 0.444\nsamples = 39\nvalue = [26, 13]\nclass = not aroused'),
 Text(211.51624548736464, 252.774, 'perceivability <= 0.281\ngini = 0.375\nsamples = 8\nvalue = [2, 6]\nclass = aroused'),
 Text(209.50180505415165, 236.46599999999998, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'),
 Text(213.53068592057764, 236.46599999999998, 'gini = 0.0\nsamples = 6\nvalue = [0, 6]\nclass = aroused'),
 Text(223.60288808664262, 252.774, 'masculinity <= 0.319\ngini = 0.35\nsamples = 31\nvalue = [24, 7]\nclass = not aroused'),
 Text(217.55956678700363, 236.46599999999998, 'masculinity <= 0.279\ngini = 0.5\nsamples = 8\nvalue = [4, 4]\nclass = not aroused'),
 Text(215.54512635379064, 220.158, 'gini = 0.0\nsamples = 3\nvalue = [3, 0]\nclass = not aroused'),
 Text(219.57400722021663, 220.158, 'familiarity <= 0.712\ngini = 0.32\nsamples = 5\nvalue = [1, 4]\nclass = aroused'),
 Text(217.55956678700363, 203.85, 'gini = 0.0\nsamples = 4\nvalue = [0, 4]\nclass = aroused'),
 Text(221.58844765342963, 203.85, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'),
 Text(229.6462093862816, 236.46599999999998, 'valence <= 0.692\ngini = 0.227\nsamples = 23\nvalue = [20, 3]\nclass = not aroused'),
 Text(227.63176895306862, 220.158, 'familiarity <= 0.828\ngini = 0.165\nsamples = 22\nvalue = [20, 2]\nclass = not aroused'),
 Text(225.61732851985562, 203.85, 'gini = 0.0\nsamples = 15\nvalue = [15, 0]\nclass = not aroused'),
 Text(229.6462093862816, 203.85, 'masculinity <= 0.392\ngini = 0.408\nsamples = 7\nvalue = [5, 2]\nclass = not aroused'),
 Text(227.63176895306862, 187.542, 'gini = 0.0\nsamples = 4\nvalue = [4, 0]\nclass = not aroused'),
 Text(231.6606498194946, 187.542, 'aoa <= 0.113\ngini = 0.444\nsamples = 3\nvalue = [1, 2]\nclass = aroused'),
 Text(229.6462093862816, 171.23399999999998, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'),
 Text(233.6750902527076, 171.23399999999998, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'),
 Text(231.6606498194946, 220.158, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(252.27718862815888, 285.39, 'perceivability <= 0.143\ngini = 0.164\nsamples = 487\nvalue = [443, 44]\nclass = not aroused'),
 Text(242.7400722021661, 269.082, 'semsize <= 0.619\ngini = 0.436\nsamples = 28\nvalue = [19, 9]\nclass = not aroused'),
 Text(240.7256317689531, 252.774, 'perceivability <= 0.135\ngini = 0.365\nsamples = 25\nvalue = [19, 6]\nclass = not aroused'),
 Text(238.7111913357401, 236.46599999999998, 'semsize <= 0.568\ngini = 0.287\nsamples = 23\nvalue = [19, 4]\nclass = not aroused'),
 Text(236.6967509025271, 220.158, 'gini = 0.0\nsamples = 12\nvalue = [12, 0]\nclass = not aroused'),
 Text(240.7256317689531, 220.158, 'semsize <= 0.579\ngini = 0.463\nsamples = 11\nvalue = [7, 4]\nclass = not aroused'),
 Text(238.7111913357401, 203.85, 'aoa <= 0.882\ngini = 0.32\nsamples = 5\nvalue = [1, 4]\nclass = aroused'),
 Text(236.6967509025271, 187.542, 'gini = 0.0\nsamples = 4\nvalue = [0, 4]\nclass = aroused'),
 Text(240.7256317689531, 187.542, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'),
 Text(242.7400722021661, 203.85, 'gini = 0.0\nsamples = 6\nvalue = [6, 0]\nclass = not aroused'),
 Text(242.7400722021661, 236.46599999999998, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'),
 Text(244.7545126353791, 252.774, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]\nclass = aroused'),
 Text(261.81430505415165, 269.082, 'aoa <= 0.051\ngini = 0.141\nsamples = 459\nvalue = [424, 35]\nclass = not aroused'),
 Text(259.79986462093865, 252.774, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(263.82874548736464, 252.774, 'dominance <= 0.057\ngini = 0.137\nsamples = 458\nvalue = [424, 34]\nclass = not aroused'),
 Text(253.31588447653434, 236.46599999999998, 'semsize <= 0.579\ngini = 0.48\nsamples = 5\nvalue = [3, 2]\nclass = not aroused'),
 Text(251.30144404332134, 220.158, 'familiarity <= 0.566\ngini = 0.444\nsamples = 3\nvalue = [1, 2]\nclass = aroused'),
 Text(249.28700361010834, 203.85, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'),
 Text(253.31588447653434, 203.85, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'),
 Text(255.33032490974733, 220.158, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'),
 Text(274.34160649819495, 236.46599999999998, 'aoa <= 0.418\ngini = 0.131\nsamples = 453\nvalue = [421, 32]\nclass = not aroused'),
 Text(259.3592057761733, 220.158, 'aoa <= 0.412\ngini = 0.22\nsamples = 119\nvalue = [104, 15]\nclass = not aroused'),
 Text(257.34476534296033, 203.85, 'web_corpus_freq <= 6.5\ngini = 0.198\nsamples = 117\nvalue = [104, 13]\nclass = not aroused'),
 Text(244.7545126353791, 187.542, 'perceivability <= 0.348\ngini = 0.363\nsamples = 42\nvalue = [32, 10]\nclass = not aroused'),
 Text(237.7039711191336, 171.23399999999998, 'familiarity <= 0.801\ngini = 0.496\nsamples = 11\nvalue = [5, 6]\nclass = aroused'),
 Text(235.6895306859206, 154.926, 'gini = 0.0\nsamples = 5\nvalue = [0, 5]\nclass = aroused'),
 Text(239.7184115523466, 154.926, 'perceivability <= 0.335\ngini = 0.278\nsamples = 6\nvalue = [5, 1]\nclass = not aroused'),
 Text(237.7039711191336, 138.618, 'gini = 0.0\nsamples = 5\nvalue = [5, 0]\nclass = not aroused'),
 Text(241.7328519855596, 138.618, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(251.80505415162457, 171.23399999999998, 'valence <= 0.604\ngini = 0.225\nsamples = 31\nvalue = [27, 4]\nclass = not aroused'),
 Text(247.77617328519858, 154.926, 'aoa <= 0.209\ngini = 0.087\nsamples = 22\nvalue = [21, 1]\nclass = not aroused'),
 Text(245.76173285198558, 138.618, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(249.79061371841158, 138.618, 'gini = 0.0\nsamples = 21\nvalue = [21, 0]\nclass = not aroused'),
 Text(255.83393501805057, 154.926, 'aoa <= 0.375\ngini = 0.444\nsamples = 9\nvalue = [6, 3]\nclass = not aroused'),
 Text(253.81949458483757, 138.618, 'valence <= 0.607\ngini = 0.245\nsamples = 7\nvalue = [6, 1]\nclass = not aroused'),
 Text(251.80505415162457, 122.31, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(255.83393501805057, 122.31, 'gini = 0.0\nsamples = 6\nvalue = [6, 0]\nclass = not aroused'),
 Text(257.84837545126356, 138.618, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'),
 Text(269.93501805054154, 187.542, 'valence <= 0.691\ngini = 0.077\nsamples = 75\nvalue = [72, 3]\nclass = not aroused'),
 Text(267.92057761732855, 171.23399999999998, 'familiarity <= 0.956\ngini = 0.053\nsamples = 74\nvalue = [72, 2]\nclass = not aroused'),
 Text(263.89169675090255, 154.926, 'length <= 3.5\ngini = 0.028\nsamples = 71\nvalue = [70, 1]\nclass = not aroused'),
 Text(261.87725631768956, 138.618, 'perceivability <= 0.244\ngini = 0.245\nsamples = 7\nvalue = [6, 1]\nclass = not aroused'),
 Text(259.86281588447656, 122.31, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(263.89169675090255, 122.31, 'gini = 0.0\nsamples = 6\nvalue = [6, 0]\nclass = not aroused'),
 Text(265.90613718411555, 138.618, 'gini = 0.0\nsamples = 64\nvalue = [64, 0]\nclass = not aroused'),
 Text(271.94945848375454, 154.926, 'aoa <= 0.219\ngini = 0.444\nsamples = 3\nvalue = [2, 1]\nclass = not aroused'),
 Text(269.93501805054154, 138.618, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'),
 Text(273.96389891696754, 138.618, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(271.94945848375454, 171.23399999999998, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(261.3736462093863, 203.85, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'),
 Text(289.32400722021663, 220.158, 'semsize <= 0.633\ngini = 0.097\nsamples = 334\nvalue = [317, 17]\nclass = not aroused'),
 Text(285.29512635379064, 203.85, 'familiarity <= 0.456\ngini = 0.088\nsamples = 326\nvalue = [311, 15]\nclass = not aroused'),
 Text(283.28068592057764, 187.542, 'gini = 0.0\nsamples = 75\nvalue = [75, 0]\nclass = not aroused'),
 Text(287.30956678700363, 187.542, 'familiarity <= 0.459\ngini = 0.112\nsamples = 251\nvalue = [236, 15]\nclass = not aroused'),
 Text(285.29512635379064, 171.23399999999998, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(289.32400722021663, 171.23399999999998, 'aoa <= 0.917\ngini = 0.106\nsamples = 250\nvalue = [236, 14]\nclass = not aroused'),
 Text(283.5324909747293, 154.926, 'perceivability <= 0.207\ngini = 0.099\nsamples = 248\nvalue = [235, 13]\nclass = not aroused'),
 Text(277.99277978339353, 138.618, 'perceivability <= 0.203\ngini = 0.287\nsamples = 23\nvalue = [19, 4]\nclass = not aroused'),
 Text(275.97833935018053, 122.31, 'length <= 4.5\ngini = 0.172\nsamples = 21\nvalue = [19, 2]\nclass = not aroused'),
 Text(273.96389891696754, 106.00200000000001, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(277.99277978339353, 106.00200000000001, 'dominance <= 0.188\ngini = 0.095\nsamples = 20\nvalue = [19, 1]\nclass = not aroused'),
 Text(275.97833935018053, 89.69400000000002, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(280.0072202166065, 89.69400000000002, 'gini = 0.0\nsamples = 19\nvalue = [19, 0]\nclass = not aroused'),
 Text(280.0072202166065, 122.31, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'),
 Text(289.07220216606504, 138.618, 'familiarity <= 0.471\ngini = 0.077\nsamples = 225\nvalue = [216, 9]\nclass = not aroused'),
 Text(284.0361010830325, 122.31, 'dominance <= 0.459\ngini = 0.375\nsamples = 4\nvalue = [3, 1]\nclass = not aroused'),
 Text(282.0216606498195, 106.00200000000001, 'gini = 0.0\nsamples = 3\nvalue = [3, 0]\nclass = not aroused'),
 Text(286.0505415162455, 106.00200000000001, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(294.1083032490975, 122.31, 'dominance <= 0.39\ngini = 0.07\nsamples = 221\nvalue = [213, 8]\nclass = not aroused'),
 Text(290.0794223826715, 106.00200000000001, 'dominance <= 0.387\ngini = 0.133\nsamples = 98\nvalue = [91, 7]\nclass = not aroused'),
 Text(288.0649819494585, 89.69400000000002, 'length <= 5.5\ngini = 0.116\nsamples = 97\nvalue = [91, 6]\nclass = not aroused'),
 Text(284.0361010830325, 73.38599999999997, 'masculinity <= 0.629\ngini = 0.363\nsamples = 21\nvalue = [16, 5]\nclass = not aroused'),
 Text(282.0216606498195, 57.077999999999975, 'aoa <= 0.494\ngini = 0.266\nsamples = 19\nvalue = [16, 3]\nclass = not aroused'),
 Text(277.99277978339353, 40.76999999999998, 'valence <= 0.31\ngini = 0.444\nsamples = 3\nvalue = [1, 2]\nclass = aroused'),
 Text(275.97833935018053, 24.46199999999999, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'),
 Text(280.0072202166065, 24.46199999999999, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'),
 Text(286.0505415162455, 40.76999999999998, 'semsize <= 0.513\ngini = 0.117\nsamples = 16\nvalue = [15, 1]\nclass = not aroused'),
 Text(284.0361010830325, 24.46199999999999, 'aoa <= 0.759\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = not aroused'),
 Text(282.0216606498195, 8.153999999999996, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(286.0505415162455, 8.153999999999996, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'),
 Text(288.0649819494585, 24.46199999999999, 'gini = 0.0\nsamples = 14\nvalue = [14, 0]\nclass = not aroused'),
 Text(286.0505415162455, 57.077999999999975, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'),
 Text(292.0938628158845, 73.38599999999997, 'valence <= 0.486\ngini = 0.026\nsamples = 76\nvalue = [75, 1]\nclass = not aroused'),
 Text(290.0794223826715, 57.077999999999975, 'gini = 0.0\nsamples = 67\nvalue = [67, 0]\nclass = not aroused'),
 Text(294.1083032490975, 57.077999999999975, 'perceivability <= 0.277\ngini = 0.198\nsamples = 9\nvalue = [8, 1]\nclass = not aroused'),
 Text(292.0938628158845, 40.76999999999998, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(296.1227436823105, 40.76999999999998, 'gini = 0.0\nsamples = 8\nvalue = [8, 0]\nclass = not aroused'),
 Text(292.0938628158845, 89.69400000000002, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(298.1371841155235, 106.00200000000001, 'familiarity <= 0.868\ngini = 0.016\nsamples = 123\nvalue = [122, 1]\nclass = not aroused'),
 Text(296.1227436823105, 89.69400000000002, 'gini = 0.0\nsamples = 119\nvalue = [119, 0]\nclass = not aroused'),
 Text(300.1516245487365, 89.69400000000002, 'dominance <= 0.464\ngini = 0.375\nsamples = 4\nvalue = [3, 1]\nclass = not aroused'),
 Text(298.1371841155235, 73.38599999999997, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(302.1660649819495, 73.38599999999997, 'gini = 0.0\nsamples = 3\nvalue = [3, 0]\nclass = not aroused'),
 Text(295.115523465704, 154.926, 'masculinity <= 0.534\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = not aroused'),
 Text(293.10108303249103, 138.618, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'),
 Text(297.129963898917, 138.618, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(293.3528880866426, 203.85, 'perceivability <= 0.48\ngini = 0.375\nsamples = 8\nvalue = [6, 2]\nclass = not aroused'),
 Text(291.3384476534296, 187.542, 'gini = 0.0\nsamples = 5\nvalue = [5, 0]\nclass = not aroused'),
 Text(295.3673285198556, 187.542, 'perceivability <= 0.674\ngini = 0.444\nsamples = 3\nvalue = [1, 2]\nclass = aroused'),
 Text(293.3528880866426, 171.23399999999998, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'),
 Text(297.3817689530686, 171.23399999999998, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'),
 Text(321.5550541516246, 301.698, 'aoa <= 0.415\ngini = 0.358\nsamples = 120\nvalue = [92, 28]\nclass = not aroused'),
 Text(316.5189530685921, 285.39, 'perceivability <= 0.902\ngini = 0.499\nsamples = 19\nvalue = [9, 10]\nclass = aroused'),
 Text(312.4900722021661, 269.082, 'masculinity <= 0.708\ngini = 0.346\nsamples = 9\nvalue = [2, 7]\nclass = aroused'),
 Text(310.4756317689531, 252.774, 'masculinity <= 0.67\ngini = 0.444\nsamples = 3\nvalue = [2, 1]\nclass = not aroused'),
 Text(308.4611913357401, 236.46599999999998, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(312.4900722021661, 236.46599999999998, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'),
 Text(314.5045126353791, 252.774, 'gini = 0.0\nsamples = 6\nvalue = [0, 6]\nclass = aroused'),
 Text(320.5478339350181, 269.082, 'masculinity <= 0.77\ngini = 0.42\nsamples = 10\nvalue = [7, 3]\nclass = not aroused'),
 Text(318.5333935018051, 252.774, 'perceivability <= 0.932\ngini = 0.48\nsamples = 5\nvalue = [2, 3]\nclass = aroused'),
 Text(316.5189530685921, 236.46599999999998, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'),
 Text(320.5478339350181, 236.46599999999998, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]\nclass = aroused'),
 Text(322.5622743682311, 252.774, 'gini = 0.0\nsamples = 5\nvalue = [5, 0]\nclass = not aroused'),
 Text(326.5911552346571, 285.39, 'perceivability <= 0.166\ngini = 0.293\nsamples = 101\nvalue = [83, 18]\nclass = not aroused'),
 Text(324.5767148014441, 269.082, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'),
 Text(328.6055956678701, 269.082, 'familiarity <= 0.904\ngini = 0.271\nsamples = 99\nvalue = [83, 16]\nclass = not aroused'),
 Text(326.5911552346571, 252.774, 'semsize <= 0.496\ngini = 0.259\nsamples = 98\nvalue = [83, 15]\nclass = not aroused'),
 Text(324.5767148014441, 236.46599999999998, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(328.6055956678701, 236.46599999999998, 'valence <= 0.489\ngini = 0.247\nsamples = 97\nvalue = [83, 14]\nclass = not aroused'),
 Text(321.8068592057762, 220.158, 'masculinity <= 0.877\ngini = 0.316\nsamples = 61\nvalue = [49, 12]\nclass = not aroused'),
 Text(319.7924187725632, 203.85, 'dominance <= 0.492\ngini = 0.282\nsamples = 59\nvalue = [49, 10]\nclass = not aroused'),
 Text(312.2382671480145, 187.542, 'perceivability <= 0.757\ngini = 0.159\nsamples = 46\nvalue = [42, 4]\nclass = not aroused'),
 Text(307.20216606498195, 171.23399999999998, 'aoa <= 0.484\ngini = 0.089\nsamples = 43\nvalue = [41, 2]\nclass = not aroused'),
 Text(303.17328519855596, 154.926, 'dominance <= 0.432\ngini = 0.375\nsamples = 4\nvalue = [3, 1]\nclass = not aroused'),
 Text(301.158844765343, 138.618, 'gini = 0.0\nsamples = 3\nvalue = [3, 0]\nclass = not aroused'),
 Text(305.187725631769, 138.618, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(311.23104693140795, 154.926, 'valence <= 0.148\ngini = 0.05\nsamples = 39\nvalue = [38, 1]\nclass = not aroused'),
 Text(309.216606498195, 138.618, 'dominance <= 0.371\ngini = 0.18\nsamples = 10\nvalue = [9, 1]\nclass = not aroused'),
 Text(307.20216606498195, 122.31, 'gini = 0.0\nsamples = 8\nvalue = [8, 0]\nclass = not aroused'),
 Text(311.23104693140795, 122.31, 'familiarity <= 0.555\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = not aroused'),
 Text(309.216606498195, 106.00200000000001, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'),
 Text(313.245487364621, 106.00200000000001, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(313.245487364621, 138.618, 'gini = 0.0\nsamples = 29\nvalue = [29, 0]\nclass = not aroused'),
 Text(317.274368231047, 171.23399999999998, 'perceivability <= 0.933\ngini = 0.444\nsamples = 3\nvalue = [1, 2]\nclass = aroused'),
 Text(315.25992779783394, 154.926, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'),
 Text(319.28880866425993, 154.926, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'),
 Text(327.3465703971119, 187.542, 'masculinity <= 0.788\ngini = 0.497\nsamples = 13\nvalue = [7, 6]\nclass = not aroused'),
 Text(325.332129963899, 171.23399999999998, 'dominance <= 0.58\ngini = 0.245\nsamples = 7\nvalue = [1, 6]\nclass = aroused'),
 Text(323.3176895306859, 154.926, 'gini = 0.0\nsamples = 6\nvalue = [0, 6]\nclass = aroused'),
 Text(327.3465703971119, 154.926, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'),
 Text(329.361010830325, 171.23399999999998, 'gini = 0.0\nsamples = 6\nvalue = [6, 0]\nclass = not aroused'),
 Text(323.8212996389892, 203.85, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'),
 Text(335.4043321299639, 220.158, 'semsize <= 0.611\ngini = 0.105\nsamples = 36\nvalue = [34, 2]\nclass = not aroused'),
 Text(333.38989169675096, 203.85, 'familiarity <= 0.775\ngini = 0.056\nsamples = 35\nvalue = [34, 1]\nclass = not aroused'),
 Text(331.3754512635379, 187.542, 'gini = 0.0\nsamples = 31\nvalue = [31, 0]\nclass = not aroused'),
 Text(335.4043321299639, 187.542, 'aoa <= 0.488\ngini = 0.375\nsamples = 4\nvalue = [3, 1]\nclass = not aroused'),
 Text(333.38989169675096, 171.23399999999998, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(337.41877256317696, 171.23399999999998, 'gini = 0.0\nsamples = 3\nvalue = [3, 0]\nclass = not aroused'),
 Text(337.41877256317696, 203.85, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(330.6200361010831, 252.774, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(363.1028880866426, 318.006, 'perceivability <= 0.686\ngini = 0.456\nsamples = 88\nvalue = [57, 31]\nclass = not aroused'),
 Text(350.5126353790614, 301.698, 'aoa <= 0.876\ngini = 0.399\nsamples = 69\nvalue = [50, 19]\nclass = not aroused'),
 Text(348.4981949458484, 285.39, 'dominance <= 0.622\ngini = 0.379\nsamples = 67\nvalue = [50, 17]\nclass = not aroused'),
 Text(341.44765342960295, 269.082, 'dominance <= 0.619\ngini = 0.492\nsamples = 16\nvalue = [9, 7]\nclass = not aroused'),
 Text(339.4332129963899, 252.774, 'aoa <= 0.402\ngini = 0.375\nsamples = 12\nvalue = [9, 3]\nclass = not aroused'),
 Text(337.41877256317696, 236.46599999999998, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'),
 Text(341.44765342960295, 236.46599999999998, 'semsize <= 0.513\ngini = 0.18\nsamples = 10\nvalue = [9, 1]\nclass = not aroused'),
 Text(339.4332129963899, 220.158, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(343.4620938628159, 220.158, 'gini = 0.0\nsamples = 9\nvalue = [9, 0]\nclass = not aroused'),
 Text(343.4620938628159, 252.774, 'gini = 0.0\nsamples = 4\nvalue = [0, 4]\nclass = aroused'),
 Text(355.54873646209387, 269.082, 'semsize <= 0.586\ngini = 0.315\nsamples = 51\nvalue = [41, 10]\nclass = not aroused'),
 Text(347.4909747292419, 252.774, 'masculinity <= 0.748\ngini = 0.069\nsamples = 28\nvalue = [27, 1]\nclass = not aroused'),
 Text(345.47653429602894, 236.46599999999998, 'gini = 0.0\nsamples = 26\nvalue = [26, 0]\nclass = not aroused'),
 Text(349.50541516245494, 236.46599999999998, 'perceivability <= 0.514\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = not aroused'),
 Text(347.4909747292419, 220.158, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'),
 Text(351.5198555956679, 220.158, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(363.60649819494586, 252.774, 'semsize <= 0.608\ngini = 0.476\nsamples = 23\nvalue = [14, 9]\nclass = not aroused'),
 Text(357.5631768953069, 236.46599999999998, 'masculinity <= 0.543\ngini = 0.463\nsamples = 11\nvalue = [4, 7]\nclass = aroused'),
 Text(355.54873646209387, 220.158, 'gini = 0.0\nsamples = 6\nvalue = [0, 6]\nclass = aroused'),
 Text(359.57761732851986, 220.158, 'valence <= 0.38\ngini = 0.32\nsamples = 5\nvalue = [4, 1]\nclass = not aroused'),
 Text(357.5631768953069, 203.85, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(361.5920577617329, 203.85, 'gini = 0.0\nsamples = 4\nvalue = [4, 0]\nclass = not aroused'),
 Text(369.6498194945849, 236.46599999999998, 'masculinity <= 0.741\ngini = 0.278\nsamples = 12\nvalue = [10, 2]\nclass = not aroused'),
 Text(367.63537906137185, 220.158, 'valence <= 0.695\ngini = 0.165\nsamples = 11\nvalue = [10, 1]\nclass = not aroused'),
 Text(365.6209386281589, 203.85, 'gini = 0.0\nsamples = 10\nvalue = [10, 0]\nclass = not aroused'),
 Text(369.6498194945849, 203.85, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(371.66425992779784, 220.158, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(352.5270758122744, 285.39, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'),
 Text(375.69314079422384, 301.698, 'web_corpus_freq <= 6.5\ngini = 0.465\nsamples = 19\nvalue = [7, 12]\nclass = aroused'),
 Text(371.66425992779784, 285.39, 'masculinity <= 0.679\ngini = 0.18\nsamples = 10\nvalue = [1, 9]\nclass = aroused'),
 Text(369.6498194945849, 269.082, 'valence <= 0.617\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = not aroused'),
 Text(367.63537906137185, 252.774, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'),
 Text(371.66425992779784, 252.774, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(373.6787003610109, 269.082, 'gini = 0.0\nsamples = 8\nvalue = [0, 8]\nclass = aroused'),
 Text(379.72202166064983, 285.39, 'aoa <= 0.329\ngini = 0.444\nsamples = 9\nvalue = [6, 3]\nclass = not aroused'),
 Text(377.7075812274369, 269.082, 'semsize <= 0.597\ngini = 0.375\nsamples = 4\nvalue = [1, 3]\nclass = aroused'),
 Text(375.69314079422384, 252.774, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]\nclass = aroused'),
 Text(379.72202166064983, 252.774, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'),
 Text(381.7364620938629, 269.082, 'gini = 0.0\nsamples = 5\nvalue = [5, 0]\nclass = not aroused'),
 Text(525.1000958935018, 366.93, 'dominance <= 0.608\ngini = 0.477\nsamples = 530\nvalue = [322, 208]\nclass = not aroused'),
 Text(482.12798962093865, 350.62199999999996, 'valence <= 0.402\ngini = 0.452\nsamples = 460\nvalue = [301, 159]\nclass = not aroused'),
 Text(440.1552346570397, 334.31399999999996, 'dominance <= 0.33\ngini = 0.499\nsamples = 244\nvalue = [128, 116]\nclass = not aroused'),
 Text(408.9314079422383, 318.006, 'semsize <= 0.824\ngini = 0.456\nsamples = 131\nvalue = [85, 46]\nclass = not aroused'),
 Text(406.9169675090253, 301.698, 'aoa <= 0.549\ngini = 0.408\nsamples = 119\nvalue = [85, 34]\nclass = not aroused'),
 Text(393.82310469314086, 285.39, 'perceivability <= 0.474\ngini = 0.5\nsamples = 41\nvalue = [20, 21]\nclass = aroused'),
 Text(385.7653429602889, 269.082, 'masculinity <= 0.447\ngini = 0.403\nsamples = 25\nvalue = [7, 18]\nclass = aroused'),
 Text(383.7509025270758, 252.774, 'gini = 0.0\nsamples = 3\nvalue = [3, 0]\nclass = not aroused'),
 Text(387.7797833935018, 252.774, 'familiarity <= 0.82\ngini = 0.298\nsamples = 22\nvalue = [4, 18]\nclass = aroused'),
 Text(383.7509025270758, 236.46599999999998, 'aoa <= 0.309\ngini = 0.111\nsamples = 17\nvalue = [1, 16]\nclass = aroused'),
 Text(381.7364620938629, 220.158, 'aoa <= 0.256\ngini = 0.444\nsamples = 3\nvalue = [1, 2]\nclass = aroused'),
 Text(379.72202166064983, 203.85, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'),
 Text(383.7509025270758, 203.85, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'),
 Text(385.7653429602889, 220.158, 'gini = 0.0\nsamples = 14\nvalue = [0, 14]\nclass = aroused'),
 Text(391.8086642599278, 236.46599999999998, 'perceivability <= 0.409\ngini = 0.48\nsamples = 5\nvalue = [3, 2]\nclass = not aroused'),
 Text(389.79422382671487, 220.158, 'gini = 0.0\nsamples = 3\nvalue = [3, 0]\nclass = not aroused'),
 Text(393.82310469314086, 220.158, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'),
 Text(401.88086642599285, 269.082, 'dominance <= 0.28\ngini = 0.305\nsamples = 16\nvalue = [13, 3]\nclass = not aroused'),
 Text(397.85198555956686, 252.774, 'perceivability <= 0.86\ngini = 0.142\nsamples = 13\nvalue = [12, 1]\nclass = not aroused'),
 Text(395.8375451263538, 236.46599999999998, 'gini = 0.0\nsamples = 11\nvalue = [11, 0]\nclass = not aroused'),
 Text(399.8664259927798, 236.46599999999998, 'masculinity <= 0.656\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = not aroused'),
 Text(397.85198555956686, 220.158, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'),
 Text(401.88086642599285, 220.158, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(405.90974729241884, 252.774, 'length <= 6.0\ngini = 0.444\nsamples = 3\nvalue = [1, 2]\nclass = aroused'),
 Text(403.8953068592058, 236.46599999999998, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'),
 Text(407.9241877256318, 236.46599999999998, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'),
 Text(420.01083032490976, 285.39, 'web_corpus_freq <= 5.5\ngini = 0.278\nsamples = 78\nvalue = [65, 13]\nclass = not aroused'),
 Text(417.9963898916968, 269.082, 'gini = 0.0\nsamples = 21\nvalue = [21, 0]\nclass = not aroused'),
 Text(422.0252707581228, 269.082, 'valence <= 0.163\ngini = 0.352\nsamples = 57\nvalue = [44, 13]\nclass = not aroused'),
 Text(413.96750902527083, 252.774, 'perceivability <= 0.158\ngini = 0.208\nsamples = 34\nvalue = [30, 4]\nclass = not aroused'),
 Text(411.9530685920578, 236.46599999999998, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(415.98194945848377, 236.46599999999998, 'valence <= 0.068\ngini = 0.165\nsamples = 33\nvalue = [30, 3]\nclass = not aroused'),
 Text(411.9530685920578, 220.158, 'semsize <= 0.757\ngini = 0.48\nsamples = 5\nvalue = [3, 2]\nclass = not aroused'),
 Text(409.93862815884484, 203.85, 'perceivability <= 0.553\ngini = 0.444\nsamples = 3\nvalue = [1, 2]\nclass = aroused'),
 Text(407.9241877256318, 187.542, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'),
 Text(411.9530685920578, 187.542, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'),
 Text(413.96750902527083, 203.85, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'),
 Text(420.01083032490976, 220.158, 'semsize <= 0.652\ngini = 0.069\nsamples = 28\nvalue = [27, 1]\nclass = not aroused'),
 Text(417.9963898916968, 203.85, 'length <= 6.5\ngini = 0.444\nsamples = 3\nvalue = [2, 1]\nclass = not aroused'),
 Text(415.98194945848377, 187.542, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(420.01083032490976, 187.542, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'),
 Text(422.0252707581228, 203.85, 'gini = 0.0\nsamples = 25\nvalue = [25, 0]\nclass = not aroused'),
 Text(430.0830324909748, 252.774, 'dominance <= 0.176\ngini = 0.476\nsamples = 23\nvalue = [14, 9]\nclass = not aroused'),
 Text(426.0541516245488, 236.46599999999998, 'perceivability <= 0.242\ngini = 0.375\nsamples = 8\nvalue = [2, 6]\nclass = aroused'),
 Text(424.03971119133575, 220.158, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'),
 Text(428.06859205776175, 220.158, 'gini = 0.0\nsamples = 6\nvalue = [0, 6]\nclass = aroused'),
 Text(434.1119133574008, 236.46599999999998, 'familiarity <= 0.367\ngini = 0.32\nsamples = 15\nvalue = [12, 3]\nclass = not aroused'),
 Text(432.09747292418774, 220.158, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(436.12635379061373, 220.158, 'valence <= 0.325\ngini = 0.245\nsamples = 14\nvalue = [12, 2]\nclass = not aroused'),
 Text(432.09747292418774, 203.85, 'masculinity <= 0.538\ngini = 0.153\nsamples = 12\nvalue = [11, 1]\nclass = not aroused'),
 Text(430.0830324909748, 187.542, 'familiarity <= 0.804\ngini = 0.444\nsamples = 3\nvalue = [2, 1]\nclass = not aroused'),
 Text(428.06859205776175, 171.23399999999998, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'),
 Text(432.09747292418774, 171.23399999999998, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(434.1119133574008, 187.542, 'gini = 0.0\nsamples = 9\nvalue = [9, 0]\nclass = not aroused'),
 Text(440.1552346570397, 203.85, 'valence <= 0.338\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = not aroused'),
 Text(438.1407942238268, 187.542, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(442.1696750902528, 187.542, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'),
 Text(410.9458483754513, 301.698, 'gini = 0.0\nsamples = 12\nvalue = [0, 12]\nclass = aroused'),
 Text(471.3790613718412, 318.006, 'masculinity <= 0.755\ngini = 0.471\nsamples = 113\nvalue = [43, 70]\nclass = aroused'),
 Text(464.3285198555957, 301.698, 'semsize <= 0.737\ngini = 0.497\nsamples = 80\nvalue = [37, 43]\nclass = aroused'),
 Text(456.2707581227437, 285.39, 'familiarity <= 0.579\ngini = 0.495\nsamples = 58\nvalue = [32, 26]\nclass = not aroused'),
 Text(448.2129963898917, 269.082, 'familiarity <= 0.476\ngini = 0.465\nsamples = 19\nvalue = [7, 12]\nclass = aroused'),
 Text(444.1841155234657, 252.774, 'dominance <= 0.401\ngini = 0.444\nsamples = 9\nvalue = [6, 3]\nclass = not aroused'),
 Text(442.1696750902528, 236.46599999999998, 'semsize <= 0.715\ngini = 0.375\nsamples = 4\nvalue = [1, 3]\nclass = aroused'),
 Text(440.1552346570397, 220.158, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]\nclass = aroused'),
 Text(444.1841155234657, 220.158, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'),
 Text(446.1985559566788, 236.46599999999998, 'gini = 0.0\nsamples = 5\nvalue = [5, 0]\nclass = not aroused'),
 Text(452.2418772563177, 252.774, 'dominance <= 0.418\ngini = 0.18\nsamples = 10\nvalue = [1, 9]\nclass = aroused'),
 Text(450.22743682310477, 236.46599999999998, 'semsize <= 0.677\ngini = 0.444\nsamples = 3\nvalue = [1, 2]\nclass = aroused'),
 Text(448.2129963898917, 220.158, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'),
 Text(452.2418772563177, 220.158, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'),
 Text(454.25631768953076, 236.46599999999998, 'gini = 0.0\nsamples = 7\nvalue = [0, 7]\nclass = aroused'),
 Text(464.3285198555957, 269.082, 'valence <= 0.207\ngini = 0.46\nsamples = 39\nvalue = [25, 14]\nclass = not aroused'),
 Text(460.2996389891697, 252.774, 'aoa <= 0.241\ngini = 0.245\nsamples = 21\nvalue = [18, 3]\nclass = not aroused'),
 Text(458.28519855595675, 236.46599999999998, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(462.31407942238275, 236.46599999999998, 'perceivability <= 0.769\ngini = 0.18\nsamples = 20\nvalue = [18, 2]\nclass = not aroused'),
 Text(460.2996389891697, 220.158, 'familiarity <= 0.873\ngini = 0.1\nsamples = 19\nvalue = [18, 1]\nclass = not aroused'),
 Text(458.28519855595675, 203.85, 'gini = 0.0\nsamples = 17\nvalue = [17, 0]\nclass = not aroused'),
 Text(462.31407942238275, 203.85, 'perceivability <= 0.584\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = not aroused'),
 Text(460.2996389891697, 187.542, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(464.3285198555957, 187.542, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'),
 Text(464.3285198555957, 220.158, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(468.3574007220217, 252.774, 'masculinity <= 0.61\ngini = 0.475\nsamples = 18\nvalue = [7, 11]\nclass = aroused'),
 Text(466.34296028880874, 236.46599999999998, 'gini = 0.0\nsamples = 7\nvalue = [0, 7]\nclass = aroused'),
 Text(470.37184115523473, 236.46599999999998, 'dominance <= 0.393\ngini = 0.463\nsamples = 11\nvalue = [7, 4]\nclass = not aroused'),
 Text(468.3574007220217, 220.158, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]\nclass = aroused'),
 Text(472.3862815884477, 220.158, 'semsize <= 0.714\ngini = 0.219\nsamples = 8\nvalue = [7, 1]\nclass = not aroused'),
 Text(470.37184115523473, 203.85, 'gini = 0.0\nsamples = 7\nvalue = [7, 0]\nclass = not aroused'),
 Text(474.4007220216607, 203.85, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(472.3862815884477, 285.39, 'aoa <= 0.589\ngini = 0.351\nsamples = 22\nvalue = [5, 17]\nclass = aroused'),
 Text(470.37184115523473, 269.082, 'gini = 0.0\nsamples = 9\nvalue = [0, 9]\nclass = aroused'),
 Text(474.4007220216607, 269.082, 'familiarity <= 0.524\ngini = 0.473\nsamples = 13\nvalue = [5, 8]\nclass = aroused'),
 Text(472.3862815884477, 252.774, 'gini = 0.0\nsamples = 6\nvalue = [0, 6]\nclass = aroused'),
 Text(476.41516245487367, 252.774, 'familiarity <= 0.63\ngini = 0.408\nsamples = 7\nvalue = [5, 2]\nclass = not aroused'),
 Text(474.4007220216607, 236.46599999999998, 'gini = 0.0\nsamples = 4\nvalue = [4, 0]\nclass = not aroused'),
 Text(478.4296028880867, 236.46599999999998, 'masculinity <= 0.541\ngini = 0.444\nsamples = 3\nvalue = [1, 2]\nclass = aroused'),
 Text(476.41516245487367, 220.158, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'),
 Text(480.44404332129966, 220.158, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'),
 Text(478.4296028880867, 301.698, 'length <= 4.5\ngini = 0.298\nsamples = 33\nvalue = [6, 27]\nclass = aroused'),
 Text(476.41516245487367, 285.39, 'gini = 0.0\nsamples = 3\nvalue = [3, 0]\nclass = not aroused'),
 Text(480.44404332129966, 285.39, 'masculinity <= 0.822\ngini = 0.18\nsamples = 30\nvalue = [3, 27]\nclass = aroused'),
 Text(478.4296028880867, 269.082, 'gini = 0.0\nsamples = 24\nvalue = [0, 24]\nclass = aroused'),
 Text(482.4584837545127, 269.082, 'perceivability <= 0.626\ngini = 0.5\nsamples = 6\nvalue = [3, 3]\nclass = not aroused'),
 Text(480.44404332129966, 252.774, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'),
 Text(484.47292418772565, 252.774, 'perceivability <= 0.751\ngini = 0.375\nsamples = 4\nvalue = [1, 3]\nclass = aroused'),
 Text(482.4584837545127, 236.46599999999998, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]\nclass = aroused'),
 Text(486.4873646209387, 236.46599999999998, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'),
 Text(524.1007445848376, 334.31399999999996, 'valence <= 0.645\ngini = 0.319\nsamples = 216\nvalue = [173, 43]\nclass = not aroused'),
 Text(507.3242328519856, 318.006, 'semsize <= 0.887\ngini = 0.246\nsamples = 181\nvalue = [155, 26]\nclass = not aroused'),
 Text(499.707129963899, 301.698, 'masculinity <= 0.345\ngini = 0.204\nsamples = 173\nvalue = [153, 20]\nclass = not aroused'),
 Text(490.5162454873647, 285.39, 'semsize <= 0.647\ngini = 0.32\nsamples = 5\nvalue = [1, 4]\nclass = aroused'),
 Text(488.50180505415165, 269.082, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'),
 Text(492.53068592057764, 269.082, 'gini = 0.0\nsamples = 4\nvalue = [0, 4]\nclass = aroused'),
 Text(508.89801444043326, 285.39, 'perceivability <= 0.225\ngini = 0.172\nsamples = 168\nvalue = [152, 16]\nclass = not aroused'),
 Text(496.55956678700363, 269.082, 'semsize <= 0.729\ngini = 0.444\nsamples = 18\nvalue = [12, 6]\nclass = not aroused'),
 Text(492.53068592057764, 252.774, 'dominance <= 0.59\ngini = 0.18\nsamples = 10\nvalue = [9, 1]\nclass = not aroused'),
 Text(490.5162454873647, 236.46599999999998, 'gini = 0.0\nsamples = 9\nvalue = [9, 0]\nclass = not aroused'),
 Text(494.5451263537907, 236.46599999999998, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(500.5884476534296, 252.774, 'aoa <= 0.291\ngini = 0.469\nsamples = 8\nvalue = [3, 5]\nclass = aroused'),
 Text(498.5740072202167, 236.46599999999998, 'gini = 0.0\nsamples = 3\nvalue = [3, 0]\nclass = not aroused'),
 Text(502.6028880866427, 236.46599999999998, 'gini = 0.0\nsamples = 5\nvalue = [0, 5]\nclass = aroused'),
 Text(521.2364620938629, 269.082, 'semsize <= 0.783\ngini = 0.124\nsamples = 150\nvalue = [140, 10]\nclass = not aroused'),
 Text(513.6823104693142, 252.774, 'semsize <= 0.677\ngini = 0.082\nsamples = 116\nvalue = [111, 5]\nclass = not aroused'),
 Text(506.6317689530687, 236.46599999999998, 'dominance <= 0.504\ngini = 0.18\nsamples = 40\nvalue = [36, 4]\nclass = not aroused'),
 Text(502.6028880866427, 220.158, 'perceivability <= 0.937\ngini = 0.064\nsamples = 30\nvalue = [29, 1]\nclass = not aroused'),
 Text(500.5884476534296, 203.85, 'gini = 0.0\nsamples = 27\nvalue = [27, 0]\nclass = not aroused'),
 Text(504.6173285198556, 203.85, 'valence <= 0.501\ngini = 0.444\nsamples = 3\nvalue = [2, 1]\nclass = not aroused'),
 Text(502.6028880866427, 187.542, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(506.6317689530687, 187.542, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'),
 Text(510.66064981949467, 220.158, 'length <= 6.5\ngini = 0.42\nsamples = 10\nvalue = [7, 3]\nclass = not aroused'),
 Text(508.6462093862816, 203.85, 'gini = 0.0\nsamples = 6\nvalue = [6, 0]\nclass = not aroused'),
 Text(512.6750902527076, 203.85, 'valence <= 0.628\ngini = 0.375\nsamples = 4\nvalue = [1, 3]\nclass = aroused'),
 Text(510.66064981949467, 187.542, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]\nclass = aroused'),
 Text(514.6895306859207, 187.542, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'),
 Text(520.7328519855596, 236.46599999999998, 'valence <= 0.468\ngini = 0.026\nsamples = 76\nvalue = [75, 1]\nclass = not aroused'),
 Text(518.7184115523467, 220.158, 'valence <= 0.46\ngini = 0.219\nsamples = 8\nvalue = [7, 1]\nclass = not aroused'),
 Text(516.7039711191336, 203.85, 'gini = 0.0\nsamples = 7\nvalue = [7, 0]\nclass = not aroused'),
 Text(520.7328519855596, 203.85, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(522.7472924187726, 220.158, 'gini = 0.0\nsamples = 68\nvalue = [68, 0]\nclass = not aroused'),
 Text(528.7906137184116, 252.774, 'semsize <= 0.785\ngini = 0.251\nsamples = 34\nvalue = [29, 5]\nclass = not aroused'),
 Text(526.7761732851986, 236.46599999999998, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'),
 Text(530.8050541516246, 236.46599999999998, 'length <= 4.5\ngini = 0.17\nsamples = 32\nvalue = [29, 3]\nclass = not aroused'),
 Text(526.7761732851986, 220.158, 'semsize <= 0.819\ngini = 0.5\nsamples = 4\nvalue = [2, 2]\nclass = not aroused'),
 Text(524.7617328519856, 203.85, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'),
 Text(528.7906137184116, 203.85, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'),
 Text(534.8339350180506, 220.158, 'length <= 9.5\ngini = 0.069\nsamples = 28\nvalue = [27, 1]\nclass = not aroused'),
 Text(532.8194945848376, 203.85, 'gini = 0.0\nsamples = 24\nvalue = [24, 0]\nclass = not aroused'),
 Text(536.8483754512636, 203.85, 'semsize <= 0.827\ngini = 0.375\nsamples = 4\nvalue = [3, 1]\nclass = not aroused'),
 Text(534.8339350180506, 187.542, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(538.8628158844766, 187.542, 'gini = 0.0\nsamples = 3\nvalue = [3, 0]\nclass = not aroused'),
 Text(514.9413357400723, 301.698, 'dominance <= 0.452\ngini = 0.375\nsamples = 8\nvalue = [2, 6]\nclass = aroused'),
 Text(512.9268953068593, 285.39, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'),
 Text(516.9557761732852, 285.39, 'gini = 0.0\nsamples = 6\nvalue = [0, 6]\nclass = aroused'),
 Text(540.8772563176896, 318.006, 'familiarity <= 0.873\ngini = 0.5\nsamples = 35\nvalue = [18, 17]\nclass = not aroused'),
 Text(538.8628158844766, 301.698, 'dominance <= 0.441\ngini = 0.471\nsamples = 29\nvalue = [18, 11]\nclass = not aroused'),
 Text(532.8194945848376, 285.39, 'length <= 7.5\ngini = 0.346\nsamples = 9\nvalue = [2, 7]\nclass = aroused'),
 Text(530.8050541516246, 269.082, 'gini = 0.0\nsamples = 6\nvalue = [0, 6]\nclass = aroused'),
 Text(534.8339350180506, 269.082, 'length <= 9.5\ngini = 0.444\nsamples = 3\nvalue = [2, 1]\nclass = not aroused'),
 Text(532.8194945848376, 252.774, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'),
 Text(536.8483754512636, 252.774, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(544.9061371841156, 285.39, 'masculinity <= 0.708\ngini = 0.32\nsamples = 20\nvalue = [16, 4]\nclass = not aroused'),
 Text(542.8916967509026, 269.082, 'masculinity <= 0.408\ngini = 0.198\nsamples = 18\nvalue = [16, 2]\nclass = not aroused'),
 Text(540.8772563176896, 252.774, 'valence <= 0.662\ngini = 0.5\nsamples = 4\nvalue = [2, 2]\nclass = not aroused'),
 Text(538.8628158844766, 236.46599999999998, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'),
 Text(542.8916967509026, 236.46599999999998, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'),
 Text(544.9061371841156, 252.774, 'gini = 0.0\nsamples = 14\nvalue = [14, 0]\nclass = not aroused'),
 Text(546.9205776173286, 269.082, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'),
 Text(542.8916967509026, 301.698, 'gini = 0.0\nsamples = 6\nvalue = [0, 6]\nclass = aroused'),
 Text(568.072202166065, 350.62199999999996, 'valence <= 0.687\ngini = 0.42\nsamples = 70\nvalue = [21, 49]\nclass = aroused'),
 Text(561.0216606498195, 334.31399999999996, 'perceivability <= 0.248\ngini = 0.379\nsamples = 63\nvalue = [16, 47]\nclass = aroused'),
 Text(556.9927797833935, 318.006, 'aoa <= 0.774\ngini = 0.496\nsamples = 11\nvalue = [6, 5]\nclass = not aroused'),
 Text(554.9783393501806, 301.698, 'semsize <= 0.677\ngini = 0.408\nsamples = 7\nvalue = [2, 5]\nclass = aroused'),
 Text(552.9638989169675, 285.39, 'masculinity <= 0.467\ngini = 0.444\nsamples = 3\nvalue = [2, 1]\nclass = not aroused'),
 Text(550.9494584837546, 269.082, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(554.9783393501806, 269.082, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'),
 Text(556.9927797833935, 285.39, 'gini = 0.0\nsamples = 4\nvalue = [0, 4]\nclass = aroused'),
 Text(559.0072202166066, 301.698, 'gini = 0.0\nsamples = 4\nvalue = [4, 0]\nclass = not aroused'),
 Text(565.0505415162455, 318.006, 'valence <= 0.601\ngini = 0.311\nsamples = 52\nvalue = [10, 42]\nclass = aroused'),
 Text(563.0361010830326, 301.698, 'gini = 0.0\nsamples = 19\nvalue = [0, 19]\nclass = aroused'),
 Text(567.0649819494586, 301.698, 'valence <= 0.612\ngini = 0.422\nsamples = 33\nvalue = [10, 23]\nclass = aroused'),
 Text(565.0505415162455, 285.39, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'),
 Text(569.0794223826715, 285.39, 'length <= 9.5\ngini = 0.383\nsamples = 31\nvalue = [8, 23]\nclass = aroused'),
 Text(564.043321299639, 269.082, 'masculinity <= 0.773\ngini = 0.269\nsamples = 25\nvalue = [4, 21]\nclass = aroused'),
 Text(560.014440433213, 252.774, 'semsize <= 0.658\ngini = 0.165\nsamples = 22\nvalue = [2, 20]\nclass = aroused'),
 Text(558.0, 236.46599999999998, 'familiarity <= 0.692\ngini = 0.444\nsamples = 6\nvalue = [2, 4]\nclass = aroused'),
 Text(555.9855595667871, 220.158, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]\nclass = aroused'),
 Text(560.014440433213, 220.158, 'web_corpus_freq <= 7.0\ngini = 0.444\nsamples = 3\nvalue = [2, 1]\nclass = not aroused'),
 Text(558.0, 203.85, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'),
 Text(562.0288808664261, 203.85, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(562.0288808664261, 236.46599999999998, 'gini = 0.0\nsamples = 16\nvalue = [0, 16]\nclass = aroused'),
 Text(568.072202166065, 252.774, 'perceivability <= 0.869\ngini = 0.444\nsamples = 3\nvalue = [2, 1]\nclass = not aroused'),
 Text(566.057761732852, 236.46599999999998, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'),
 Text(570.0866425992781, 236.46599999999998, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(574.115523465704, 269.082, 'masculinity <= 0.666\ngini = 0.444\nsamples = 6\nvalue = [4, 2]\nclass = not aroused'),
 Text(572.101083032491, 252.774, 'gini = 0.0\nsamples = 4\nvalue = [4, 0]\nclass = not aroused'),
 Text(576.129963898917, 252.774, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'),
 Text(575.1227436823106, 334.31399999999996, 'masculinity <= 0.469\ngini = 0.408\nsamples = 7\nvalue = [5, 2]\nclass = not aroused'),
 Text(573.1083032490975, 318.006, 'dominance <= 0.653\ngini = 0.444\nsamples = 3\nvalue = [1, 2]\nclass = aroused'),
 Text(571.0938628158846, 301.698, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'),
 Text(575.1227436823106, 301.698, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'),
 Text(577.1371841155235, 318.006, 'gini = 0.0\nsamples = 4\nvalue = [4, 0]\nclass = not aroused'),
 Text(714.209626015343, 383.238, 'dominance <= 0.585\ngini = 0.448\nsamples = 682\nvalue = [231, 451]\nclass = aroused'),
 Text(656.2590534747293, 366.93, 'semsize <= 0.725\ngini = 0.494\nsamples = 269\nvalue = [149, 120]\nclass = not aroused'),
 Text(612.5000564079423, 350.62199999999996, 'masculinity <= 0.189\ngini = 0.471\nsamples = 219\nvalue = [136, 83]\nclass = not aroused'),
 Text(585.1949458483755, 334.31399999999996, 'masculinity <= 0.108\ngini = 0.482\nsamples = 32\nvalue = [13, 19]\nclass = aroused'),
 Text(581.1660649819495, 318.006, 'web_corpus_freq <= 6.5\ngini = 0.457\nsamples = 17\nvalue = [11, 6]\nclass = not aroused'),
 Text(579.1516245487365, 301.698, 'gini = 0.0\nsamples = 6\nvalue = [6, 0]\nclass = not aroused'),
 Text(583.1805054151625, 301.698, 'aoa <= 0.239\ngini = 0.496\nsamples = 11\nvalue = [5, 6]\nclass = aroused'),
 Text(581.1660649819495, 285.39, 'familiarity <= 0.772\ngini = 0.408\nsamples = 7\nvalue = [5, 2]\nclass = not aroused'),
 Text(579.1516245487365, 269.082, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(583.1805054151625, 269.082, 'masculinity <= 0.026\ngini = 0.278\nsamples = 6\nvalue = [5, 1]\nclass = not aroused'),
 Text(581.1660649819495, 252.774, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(585.1949458483755, 252.774, 'gini = 0.0\nsamples = 5\nvalue = [5, 0]\nclass = not aroused'),
 Text(585.1949458483755, 285.39, 'gini = 0.0\nsamples = 4\nvalue = [0, 4]\nclass = aroused'),
 Text(589.2238267148015, 318.006, 'familiarity <= 0.619\ngini = 0.231\nsamples = 15\nvalue = [2, 13]\nclass = aroused'),
 Text(587.2093862815885, 301.698, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'),
 Text(591.2382671480145, 301.698, 'gini = 0.0\nsamples = 13\nvalue = [0, 13]\nclass = aroused'),
 Text(639.8051669675091, 334.31399999999996, 'aoa <= 0.453\ngini = 0.45\nsamples = 187\nvalue = [123, 64]\nclass = not aroused'),
 Text(613.0823555956679, 318.006, 'perceivability <= 0.428\ngini = 0.487\nsamples = 105\nvalue = [61, 44]\nclass = not aroused'),
 Text(595.2671480144405, 301.698, 'masculinity <= 0.405\ngini = 0.337\nsamples = 28\nvalue = [22, 6]\nclass = not aroused'),
 Text(593.2527075812275, 285.39, 'gini = 0.0\nsamples = 16\nvalue = [16, 0]\nclass = not aroused'),
 Text(597.2815884476535, 285.39, 'valence <= 0.712\ngini = 0.5\nsamples = 12\nvalue = [6, 6]\nclass = not aroused'),
 Text(595.2671480144405, 269.082, 'gini = 0.0\nsamples = 3\nvalue = [3, 0]\nclass = not aroused'),
 Text(599.2960288808665, 269.082, 'masculinity <= 0.468\ngini = 0.444\nsamples = 9\nvalue = [3, 6]\nclass = aroused'),
 Text(597.2815884476535, 252.774, 'length <= 3.5\ngini = 0.245\nsamples = 7\nvalue = [1, 6]\nclass = aroused'),
 Text(595.2671480144405, 236.46599999999998, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'),
 Text(599.2960288808665, 236.46599999999998, 'gini = 0.0\nsamples = 6\nvalue = [0, 6]\nclass = aroused'),
 Text(601.3104693140795, 252.774, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'),
 Text(630.8975631768953, 301.698, 'valence <= 0.782\ngini = 0.5\nsamples = 77\nvalue = [39, 38]\nclass = not aroused'),
 Text(616.1669675090253, 285.39, 'perceivability <= 0.581\ngini = 0.481\nsamples = 57\nvalue = [34, 23]\nclass = not aroused'),
 Text(607.3537906137185, 269.082, 'semsize <= 0.493\ngini = 0.346\nsamples = 9\nvalue = [2, 7]\nclass = aroused'),
 Text(605.3393501805054, 252.774, 'gini = 0.0\nsamples = 6\nvalue = [0, 6]\nclass = aroused'),
 Text(609.3682310469314, 252.774, 'semsize <= 0.55\ngini = 0.444\nsamples = 3\nvalue = [2, 1]\nclass = not aroused'),
 Text(607.3537906137185, 236.46599999999998, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'),
 Text(611.3826714801445, 236.46599999999998, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(624.9801444043322, 269.082, 'masculinity <= 0.323\ngini = 0.444\nsamples = 48\nvalue = [32, 16]\nclass = not aroused'),
 Text(617.4259927797834, 252.774, 'familiarity <= 0.914\ngini = 0.208\nsamples = 17\nvalue = [15, 2]\nclass = not aroused'),
 Text(615.4115523465705, 236.46599999999998, 'polysemy <= 0.5\ngini = 0.117\nsamples = 16\nvalue = [15, 1]\nclass = not aroused'),
 Text(613.3971119133574, 220.158, 'gini = 0.0\nsamples = 14\nvalue = [14, 0]\nclass = not aroused'),
 Text(617.4259927797834, 220.158, 'familiarity <= 0.863\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = not aroused'),
 Text(615.4115523465705, 203.85, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(619.4404332129965, 203.85, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'),
 Text(619.4404332129965, 236.46599999999998, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(632.5342960288809, 252.774, 'semsize <= 0.329\ngini = 0.495\nsamples = 31\nvalue = [17, 14]\nclass = not aroused'),
 Text(630.5198555956679, 236.46599999999998, 'gini = 0.0\nsamples = 4\nvalue = [0, 4]\nclass = aroused'),
 Text(634.548736462094, 236.46599999999998, 'semsize <= 0.578\ngini = 0.466\nsamples = 27\nvalue = [17, 10]\nclass = not aroused'),
 Text(627.4981949458485, 220.158, 'dominance <= 0.578\ngini = 0.32\nsamples = 15\nvalue = [12, 3]\nclass = not aroused'),
 Text(623.4693140794225, 203.85, 'perceivability <= 0.969\ngini = 0.153\nsamples = 12\nvalue = [11, 1]\nclass = not aroused'),
 Text(621.4548736462094, 187.542, 'gini = 0.0\nsamples = 11\nvalue = [11, 0]\nclass = not aroused'),
 Text(625.4837545126354, 187.542, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(631.5270758122745, 203.85, 'semsize <= 0.422\ngini = 0.444\nsamples = 3\nvalue = [1, 2]\nclass = aroused'),
 Text(629.5126353790614, 187.542, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'),
 Text(633.5415162454874, 187.542, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'),
 Text(641.5992779783394, 220.158, 'aoa <= 0.312\ngini = 0.486\nsamples = 12\nvalue = [5, 7]\nclass = aroused'),
 Text(639.5848375451264, 203.85, 'perceivability <= 0.914\ngini = 0.408\nsamples = 7\nvalue = [5, 2]\nclass = not aroused'),
 Text(637.5703971119134, 187.542, 'gini = 0.0\nsamples = 5\nvalue = [5, 0]\nclass = not aroused'),
 Text(641.5992779783394, 187.542, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'),
 Text(643.6137184115524, 203.85, 'gini = 0.0\nsamples = 5\nvalue = [0, 5]\nclass = aroused'),
 Text(645.6281588447654, 285.39, 'semsize <= 0.444\ngini = 0.375\nsamples = 20\nvalue = [5, 15]\nclass = aroused'),
 Text(643.6137184115524, 269.082, 'gini = 0.0\nsamples = 7\nvalue = [0, 7]\nclass = aroused'),
 Text(647.6425992779784, 269.082, 'familiarity <= 0.885\ngini = 0.473\nsamples = 13\nvalue = [5, 8]\nclass = aroused'),
 Text(645.6281588447654, 252.774, 'perceivability <= 0.494\ngini = 0.32\nsamples = 10\nvalue = [2, 8]\nclass = aroused'),
 Text(643.6137184115524, 236.46599999999998, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'),
 Text(647.6425992779784, 236.46599999999998, 'dominance <= 0.57\ngini = 0.198\nsamples = 9\nvalue = [1, 8]\nclass = aroused'),
 Text(645.6281588447654, 220.158, 'gini = 0.0\nsamples = 8\nvalue = [0, 8]\nclass = aroused'),
 Text(649.6570397111914, 220.158, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'),
 Text(649.6570397111914, 252.774, 'gini = 0.0\nsamples = 3\nvalue = [3, 0]\nclass = not aroused'),
 Text(666.5279783393503, 318.006, 'valence <= 0.776\ngini = 0.369\nsamples = 82\nvalue = [62, 20]\nclass = not aroused'),
 Text(657.7148014440434, 301.698, 'masculinity <= 0.219\ngini = 0.136\nsamples = 41\nvalue = [38, 3]\nclass = not aroused'),
 Text(653.6859205776174, 285.39, 'web_corpus_freq <= 5.5\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = not aroused'),
 Text(651.6714801444044, 269.082, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'),
 Text(655.7003610108304, 269.082, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(661.7436823104694, 285.39, 'length <= 5.5\ngini = 0.097\nsamples = 39\nvalue = [37, 2]\nclass = not aroused'),
 Text(659.7292418772564, 269.082, 'valence <= 0.721\ngini = 0.26\nsamples = 13\nvalue = [11, 2]\nclass = not aroused'),
 Text(657.7148014440434, 252.774, 'gini = 0.0\nsamples = 8\nvalue = [8, 0]\nclass = not aroused'),
 Text(661.7436823104694, 252.774, 'length <= 4.5\ngini = 0.48\nsamples = 5\nvalue = [3, 2]\nclass = not aroused'),
 Text(659.7292418772564, 236.46599999999998, 'gini = 0.0\nsamples = 3\nvalue = [3, 0]\nclass = not aroused'),
 Text(663.7581227436824, 236.46599999999998, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'),
 Text(663.7581227436824, 269.082, 'gini = 0.0\nsamples = 26\nvalue = [26, 0]\nclass = not aroused'),
 Text(675.3411552346571, 301.698, 'perceivability <= 0.222\ngini = 0.485\nsamples = 41\nvalue = [24, 17]\nclass = not aroused'),
 Text(669.8014440433213, 285.39, 'familiarity <= 0.774\ngini = 0.278\nsamples = 6\nvalue = [1, 5]\nclass = aroused'),
 Text(667.7870036101084, 269.082, 'gini = 0.0\nsamples = 5\nvalue = [0, 5]\nclass = aroused'),
 Text(671.8158844765344, 269.082, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'),
 Text(680.8808664259928, 285.39, 'perceivability <= 0.296\ngini = 0.451\nsamples = 35\nvalue = [23, 12]\nclass = not aroused'),
 Text(675.8447653429604, 269.082, 'semsize <= 0.686\ngini = 0.165\nsamples = 11\nvalue = [10, 1]\nclass = not aroused'),
 Text(673.8303249097473, 252.774, 'gini = 0.0\nsamples = 10\nvalue = [10, 0]\nclass = not aroused'),
 Text(677.8592057761733, 252.774, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(685.9169675090253, 269.082, 'dominance <= 0.467\ngini = 0.497\nsamples = 24\nvalue = [13, 11]\nclass = not aroused'),
 Text(681.8880866425993, 252.774, 'familiarity <= 0.787\ngini = 0.219\nsamples = 8\nvalue = [7, 1]\nclass = not aroused'),
 Text(679.8736462093864, 236.46599999999998, 'gini = 0.0\nsamples = 7\nvalue = [7, 0]\nclass = not aroused'),
 Text(683.9025270758124, 236.46599999999998, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(689.9458483754513, 252.774, 'semsize <= 0.676\ngini = 0.469\nsamples = 16\nvalue = [6, 10]\nclass = aroused'),
 Text(687.9314079422384, 236.46599999999998, 'aoa <= 0.491\ngini = 0.355\nsamples = 13\nvalue = [3, 10]\nclass = aroused'),
 Text(683.9025270758124, 220.158, 'web_corpus_freq <= 6.5\ngini = 0.444\nsamples = 3\nvalue = [2, 1]\nclass = not aroused'),
 Text(681.8880866425993, 203.85, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(685.9169675090253, 203.85, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'),
 Text(691.9602888086644, 220.158, 'length <= 10.5\ngini = 0.18\nsamples = 10\nvalue = [1, 9]\nclass = aroused'),
 Text(689.9458483754513, 203.85, 'gini = 0.0\nsamples = 9\nvalue = [0, 9]\nclass = aroused'),
 Text(693.9747292418773, 203.85, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'),
 Text(691.9602888086644, 236.46599999999998, 'gini = 0.0\nsamples = 3\nvalue = [3, 0]\nclass = not aroused'),
 Text(700.0180505415163, 350.62199999999996, 'valence <= 0.84\ngini = 0.385\nsamples = 50\nvalue = [13, 37]\nclass = aroused'),
 Text(698.0036101083033, 334.31399999999996, 'semsize <= 0.868\ngini = 0.478\nsamples = 33\nvalue = [13, 20]\nclass = aroused'),
 Text(695.9891696750904, 318.006, 'dominance <= 0.522\ngini = 0.497\nsamples = 24\nvalue = [13, 11]\nclass = not aroused'),
 Text(689.9458483754513, 301.698, 'familiarity <= 0.817\ngini = 0.355\nsamples = 13\nvalue = [10, 3]\nclass = not aroused'),
 Text(687.9314079422384, 285.39, 'gini = 0.0\nsamples = 8\nvalue = [8, 0]\nclass = not aroused'),
 Text(691.9602888086644, 285.39, 'dominance <= 0.5\ngini = 0.48\nsamples = 5\nvalue = [2, 3]\nclass = aroused'),
 Text(689.9458483754513, 269.082, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]\nclass = aroused'),
 Text(693.9747292418773, 269.082, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'),
 Text(702.0324909747293, 301.698, 'dominance <= 0.573\ngini = 0.397\nsamples = 11\nvalue = [3, 8]\nclass = aroused'),
 Text(700.0180505415163, 285.39, 'familiarity <= 0.934\ngini = 0.198\nsamples = 9\nvalue = [1, 8]\nclass = aroused'),
 Text(698.0036101083033, 269.082, 'gini = 0.0\nsamples = 8\nvalue = [0, 8]\nclass = aroused'),
 Text(702.0324909747293, 269.082, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'),
 Text(704.0469314079423, 285.39, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'),
 Text(700.0180505415163, 318.006, 'gini = 0.0\nsamples = 9\nvalue = [0, 9]\nclass = aroused'),
 Text(702.0324909747293, 334.31399999999996, 'gini = 0.0\nsamples = 17\nvalue = [0, 17]\nclass = aroused'),
 Text(772.1601985559568, 366.93, 'valence <= 0.801\ngini = 0.318\nsamples = 413\nvalue = [82, 331]\nclass = aroused'),
 Text(734.1376353790614, 350.62199999999996, 'perceivability <= 0.326\ngini = 0.464\nsamples = 150\nvalue = [55, 95]\nclass = aroused'),
 Text(713.6155234657041, 334.31399999999996, 'perceivability <= 0.146\ngini = 0.469\nsamples = 56\nvalue = [35, 21]\nclass = not aroused'),
 Text(708.0758122743683, 318.006, 'masculinity <= 0.39\ngini = 0.245\nsamples = 7\nvalue = [1, 6]\nclass = aroused'),
 Text(706.0613718411553, 301.698, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'),
 Text(710.0902527075813, 301.698, 'gini = 0.0\nsamples = 6\nvalue = [0, 6]\nclass = aroused'),
 Text(719.1552346570397, 318.006, 'masculinity <= 0.531\ngini = 0.425\nsamples = 49\nvalue = [34, 15]\nclass = not aroused'),
 Text(714.1191335740073, 301.698, 'familiarity <= 0.571\ngini = 0.298\nsamples = 33\nvalue = [27, 6]\nclass = not aroused'),
 Text(712.1046931407943, 285.39, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]\nclass = aroused'),
 Text(716.1335740072203, 285.39, 'polysemy <= 0.5\ngini = 0.18\nsamples = 30\nvalue = [27, 3]\nclass = not aroused'),
 Text(714.1191335740073, 269.082, 'familiarity <= 0.902\ngini = 0.128\nsamples = 29\nvalue = [27, 2]\nclass = not aroused'),
 Text(710.0902527075813, 252.774, 'semsize <= 0.518\ngini = 0.071\nsamples = 27\nvalue = [26, 1]\nclass = not aroused'),
 Text(708.0758122743683, 236.46599999999998, 'perceivability <= 0.223\ngini = 0.32\nsamples = 5\nvalue = [4, 1]\nclass = not aroused'),
 Text(706.0613718411553, 220.158, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(710.0902527075813, 220.158, 'gini = 0.0\nsamples = 4\nvalue = [4, 0]\nclass = not aroused'),
 Text(712.1046931407943, 236.46599999999998, 'gini = 0.0\nsamples = 22\nvalue = [22, 0]\nclass = not aroused'),
 Text(718.1480144404333, 252.774, 'perceivability <= 0.202\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = not aroused'),
 Text(716.1335740072203, 236.46599999999998, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'),
 Text(720.1624548736463, 236.46599999999998, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(718.1480144404333, 269.082, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(724.1913357400723, 301.698, 'semsize <= 0.54\ngini = 0.492\nsamples = 16\nvalue = [7, 9]\nclass = aroused'),
 Text(722.1768953068593, 285.39, 'gini = 0.0\nsamples = 3\nvalue = [3, 0]\nclass = not aroused'),
 Text(726.2057761732852, 285.39, 'perceivability <= 0.306\ngini = 0.426\nsamples = 13\nvalue = [4, 9]\nclass = aroused'),
 Text(724.1913357400723, 269.082, 'valence <= 0.704\ngini = 0.298\nsamples = 11\nvalue = [2, 9]\nclass = aroused'),
 Text(722.1768953068593, 252.774, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'),
 Text(726.2057761732852, 252.774, 'perceivability <= 0.187\ngini = 0.18\nsamples = 10\nvalue = [1, 9]\nclass = aroused'),
 Text(724.1913357400723, 236.46599999999998, 'masculinity <= 0.606\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = not aroused'),
 Text(722.1768953068593, 220.158, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(726.2057761732852, 220.158, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'),
 Text(728.2202166064983, 236.46599999999998, 'gini = 0.0\nsamples = 8\nvalue = [0, 8]\nclass = aroused'),
 Text(728.2202166064983, 269.082, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'),
 Text(754.6597472924188, 334.31399999999996, 'masculinity <= 0.049\ngini = 0.335\nsamples = 94\nvalue = [20, 74]\nclass = aroused'),
 Text(752.6453068592059, 318.006, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'),
 Text(756.6741877256319, 318.006, 'length <= 8.5\ngini = 0.315\nsamples = 92\nvalue = [18, 74]\nclass = aroused'),
 Text(740.8104693140795, 301.698, 'perceivability <= 0.416\ngini = 0.26\nsamples = 78\nvalue = [12, 66]\nclass = aroused'),
 Text(734.2635379061372, 285.39, 'familiarity <= 0.852\ngini = 0.435\nsamples = 25\nvalue = [8, 17]\nclass = aroused'),
 Text(732.2490974729243, 269.082, 'masculinity <= 0.525\ngini = 0.308\nsamples = 21\nvalue = [4, 17]\nclass = aroused'),
 Text(730.2346570397112, 252.774, 'gini = 0.0\nsamples = 11\nvalue = [0, 11]\nclass = aroused'),
 Text(734.2635379061372, 252.774, 'masculinity <= 0.536\ngini = 0.48\nsamples = 10\nvalue = [4, 6]\nclass = aroused'),
 Text(732.2490974729243, 236.46599999999998, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'),
 Text(736.2779783393503, 236.46599999999998, 'length <= 4.5\ngini = 0.375\nsamples = 8\nvalue = [2, 6]\nclass = aroused'),
 Text(734.2635379061372, 220.158, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'),
 Text(738.2924187725632, 220.158, 'familiarity <= 0.301\ngini = 0.245\nsamples = 7\nvalue = [1, 6]\nclass = aroused'),
 Text(736.2779783393503, 203.85, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'),
 Text(740.3068592057763, 203.85, 'gini = 0.0\nsamples = 6\nvalue = [0, 6]\nclass = aroused'),
 Text(736.2779783393503, 269.082, 'gini = 0.0\nsamples = 4\nvalue = [4, 0]\nclass = not aroused'),
 Text(747.3574007220218, 285.39, 'masculinity <= 0.251\ngini = 0.14\nsamples = 53\nvalue = [4, 49]\nclass = aroused'),
 Text(740.3068592057763, 269.082, 'masculinity <= 0.19\ngini = 0.444\nsamples = 6\nvalue = [2, 4]\nclass = aroused'),
 Text(738.2924187725632, 252.774, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]\nclass = aroused'),
 Text(742.3212996389892, 252.774, 'web_corpus_freq <= 6.5\ngini = 0.444\nsamples = 3\nvalue = [2, 1]\nclass = not aroused'),
 Text(740.3068592057763, 236.46599999999998, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(744.3357400722023, 236.46599999999998, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'),
 Text(754.4079422382672, 269.082, 'perceivability <= 0.958\ngini = 0.081\nsamples = 47\nvalue = [2, 45]\nclass = aroused'),
 Text(750.3790613718412, 252.774, 'masculinity <= 0.383\ngini = 0.043\nsamples = 45\nvalue = [1, 44]\nclass = aroused'),
 Text(748.3646209386283, 236.46599999999998, 'masculinity <= 0.379\ngini = 0.18\nsamples = 10\nvalue = [1, 9]\nclass = aroused'),
 Text(746.3501805054152, 220.158, 'gini = 0.0\nsamples = 8\nvalue = [0, 8]\nclass = aroused'),
 Text(750.3790613718412, 220.158, 'length <= 5.0\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = not aroused'),
 Text(748.3646209386283, 203.85, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(752.3935018050543, 203.85, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'),
 Text(752.3935018050543, 236.46599999999998, 'gini = 0.0\nsamples = 35\nvalue = [0, 35]\nclass = aroused'),
 Text(758.4368231046932, 252.774, 'polysemy <= 0.5\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = not aroused'),
 Text(756.4223826714802, 236.46599999999998, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(760.4512635379062, 236.46599999999998, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'),
 Text(772.5379061371842, 301.698, 'perceivability <= 0.793\ngini = 0.49\nsamples = 14\nvalue = [6, 8]\nclass = aroused'),
 Text(770.5234657039712, 285.39, 'masculinity <= 0.593\ngini = 0.444\nsamples = 12\nvalue = [4, 8]\nclass = aroused'),
 Text(768.5090252707582, 269.082, 'familiarity <= 0.769\ngini = 0.5\nsamples = 8\nvalue = [4, 4]\nclass = not aroused'),
 Text(766.4945848375452, 252.774, 'dominance <= 0.602\ngini = 0.32\nsamples = 5\nvalue = [4, 1]\nclass = not aroused'),
 Text(764.4801444043322, 236.46599999999998, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(768.5090252707582, 236.46599999999998, 'gini = 0.0\nsamples = 4\nvalue = [4, 0]\nclass = not aroused'),
 Text(770.5234657039712, 252.774, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]\nclass = aroused'),
 Text(772.5379061371842, 269.082, 'gini = 0.0\nsamples = 4\nvalue = [0, 4]\nclass = aroused'),
 Text(774.5523465703972, 285.39, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'),
 Text(810.1827617328521, 350.62199999999996, 'dominance <= 0.663\ngini = 0.184\nsamples = 263\nvalue = [27, 236]\nclass = aroused'),
 Text(789.408844765343, 334.31399999999996, 'perceivability <= 0.25\ngini = 0.332\nsamples = 95\nvalue = [20, 75]\nclass = aroused'),
 Text(782.6101083032491, 318.006, 'valence <= 0.896\ngini = 0.5\nsamples = 20\nvalue = [10, 10]\nclass = not aroused'),
 Text(780.5956678700362, 301.698, 'perceivability <= 0.212\ngini = 0.408\nsamples = 14\nvalue = [10, 4]\nclass = not aroused'),
 Text(778.5812274368232, 285.39, 'masculinity <= 0.369\ngini = 0.5\nsamples = 8\nvalue = [4, 4]\nclass = not aroused'),
 Text(776.5667870036102, 269.082, 'valence <= 0.857\ngini = 0.32\nsamples = 5\nvalue = [4, 1]\nclass = not aroused'),
 Text(774.5523465703972, 252.774, 'gini = 0.0\nsamples = 4\nvalue = [4, 0]\nclass = not aroused'),
 Text(778.5812274368232, 252.774, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(780.5956678700362, 269.082, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]\nclass = aroused'),
 Text(782.6101083032491, 285.39, 'gini = 0.0\nsamples = 6\nvalue = [6, 0]\nclass = not aroused'),
 Text(784.6245487364622, 301.698, 'gini = 0.0\nsamples = 6\nvalue = [0, 6]\nclass = aroused'),
 Text(796.2075812274369, 318.006, 'perceivability <= 0.451\ngini = 0.231\nsamples = 75\nvalue = [10, 65]\nclass = aroused'),
 Text(794.193140794224, 301.698, 'aoa <= 0.463\ngini = 0.34\nsamples = 46\nvalue = [10, 36]\nclass = aroused'),
 Text(786.6389891696751, 285.39, 'aoa <= 0.397\ngini = 0.497\nsamples = 13\nvalue = [6, 7]\nclass = aroused'),
 Text(784.6245487364622, 269.082, 'perceivability <= 0.349\ngini = 0.42\nsamples = 10\nvalue = [3, 7]\nclass = aroused'),
 Text(782.6101083032491, 252.774, 'gini = 0.0\nsamples = 5\nvalue = [0, 5]\nclass = aroused'),
 Text(786.6389891696751, 252.774, 'aoa <= 0.273\ngini = 0.48\nsamples = 5\nvalue = [3, 2]\nclass = not aroused'),
 Text(784.6245487364622, 236.46599999999998, 'gini = 0.0\nsamples = 3\nvalue = [3, 0]\nclass = not aroused'),
 Text(788.6534296028882, 236.46599999999998, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'),
 Text(788.6534296028882, 269.082, 'gini = 0.0\nsamples = 3\nvalue = [3, 0]\nclass = not aroused'),
 Text(801.7472924187726, 285.39, 'length <= 12.5\ngini = 0.213\nsamples = 33\nvalue = [4, 29]\nclass = aroused'),
 Text(799.7328519855596, 269.082, 'masculinity <= 0.168\ngini = 0.17\nsamples = 32\nvalue = [3, 29]\nclass = aroused'),
 Text(794.6967509025271, 252.774, 'familiarity <= 0.622\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = not aroused'),
 Text(792.6823104693142, 236.46599999999998, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'),
 Text(796.7111913357402, 236.46599999999998, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(804.7689530685922, 252.774, 'dominance <= 0.596\ngini = 0.124\nsamples = 30\nvalue = [2, 28]\nclass = aroused'),
 Text(800.7400722021662, 236.46599999999998, 'perceivability <= 0.349\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = not aroused'),
 Text(798.7256317689531, 220.158, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'),
 Text(802.7545126353791, 220.158, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(808.7978339350182, 236.46599999999998, 'length <= 9.0\ngini = 0.069\nsamples = 28\nvalue = [1, 27]\nclass = aroused'),
 Text(806.7833935018051, 220.158, 'gini = 0.0\nsamples = 23\nvalue = [0, 23]\nclass = aroused'),
 Text(810.8122743682311, 220.158, 'valence <= 0.907\ngini = 0.32\nsamples = 5\nvalue = [1, 4]\nclass = aroused'),
 Text(808.7978339350182, 203.85, 'gini = 0.0\nsamples = 4\nvalue = [0, 4]\nclass = aroused'),
 Text(812.8267148014442, 203.85, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'),
 Text(803.7617328519857, 269.082, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'),
 Text(798.22202166065, 301.698, 'gini = 0.0\nsamples = 29\nvalue = [0, 29]\nclass = aroused'),
 Text(830.9566787003611, 334.31399999999996, 'familiarity <= 0.404\ngini = 0.08\nsamples = 168\nvalue = [7, 161]\nclass = aroused'),
 Text(828.9422382671481, 318.006, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'),
 Text(832.9711191335741, 318.006, 'semsize <= 0.649\ngini = 0.069\nsamples = 167\nvalue = [6, 161]\nclass = aroused'),
 Text(830.9566787003611, 301.698, 'semsize <= 0.649\ngini = 0.219\nsamples = 48\nvalue = [6, 42]\nclass = aroused'),
 Text(828.9422382671481, 285.39, 'dominance <= 0.812\ngini = 0.19\nsamples = 47\nvalue = [5, 42]\nclass = aroused'),
 Text(826.9277978339351, 269.082, 'familiarity <= 0.901\ngini = 0.159\nsamples = 46\nvalue = [4, 42]\nclass = aroused'),
 Text(822.8989169675091, 252.774, 'perceivability <= 0.311\ngini = 0.097\nsamples = 39\nvalue = [2, 37]\nclass = aroused'),
 Text(820.8844765342961, 236.46599999999998, 'perceivability <= 0.303\ngini = 0.245\nsamples = 14\nvalue = [2, 12]\nclass = aroused'),
 Text(818.8700361010831, 220.158, 'length <= 8.0\ngini = 0.142\nsamples = 13\nvalue = [1, 12]\nclass = aroused'),
 Text(816.8555956678701, 203.85, 'gini = 0.0\nsamples = 11\nvalue = [0, 11]\nclass = aroused'),
 Text(820.8844765342961, 203.85, 'aoa <= 0.65\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = not aroused'),
 Text(818.8700361010831, 187.542, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'),
 Text(822.8989169675091, 187.542, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'),
 Text(822.8989169675091, 220.158, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'),
 Text(824.9133574007221, 236.46599999999998, 'gini = 0.0\nsamples = 25\nvalue = [0, 25]\nclass = aroused'),
 Text(830.9566787003611, 252.774, 'aoa <= 0.283\ngini = 0.408\nsamples = 7\nvalue = [2, 5]\nclass = aroused'),
 Text(828.9422382671481, 236.46599999999998, 'gini = 0.0\nsamples = 5\nvalue = [0, 5]\nclass = aroused'),
 Text(832.9711191335741, 236.46599999999998, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'),
 Text(830.9566787003611, 269.082, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'),
 Text(832.9711191335741, 285.39, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'),
 Text(834.9855595667871, 301.698, 'gini = 0.0\nsamples = 119\nvalue = [0, 119]\nclass = aroused')]
In [ ]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import cross_val_score

plot_confusion_matrix(clf_dt, X_test, y_test, display_labels=["not aroused","aroused"])
/usr/local/lib/python3.7/dist-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function plot_confusion_matrix is deprecated; Function `plot_confusion_matrix` is deprecated in 1.0 and will be removed in 1.2. Use one of the class methods: ConfusionMatrixDisplay.from_predictions or ConfusionMatrixDisplay.from_estimator.
  warnings.warn(msg, category=FutureWarning)
Out[ ]:
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x7f87310de210>
In [ ]:
y_pred = clf_dt.predict(X_train)
y_pred = clf_dt.predict(X_test)

print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average=None))
print(classification_report(y_test, y_pred))

y_score = clf_dt.predict_proba(X_test)

fpr, tpr, th = roc_curve(y_test, y_score[:,1])

roc_auc = auc(fpr, tpr)
print(roc_auc)

plt.figure(figsize=(8,5))

plt.plot(fpr, tpr)

plt.plot([0,1], [0,1], 'k--')
plt.xlabel('False Positive Rate', fontsize=20)
plt.ylabel('True Positive Rate', fontsize=20)

plt.tick_params(axis='both', which='major', labelsize=22)
plt.show()
Accuracy 0.7719897523484202
F1-score [0.8441331  0.57551669]
              precision    recall  f1-score   support

         0.0       0.85      0.83      0.84       867
         1.0       0.56      0.60      0.58       304

    accuracy                           0.77      1171
   macro avg       0.71      0.71      0.71      1171
weighted avg       0.78      0.77      0.77      1171

0.7146523857220907
In [ ]:
path = clf_dt.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas
ccp_alphas = ccp_alphas[:-1]

clf_dts=[]

for ccp_alpha in ccp_alphas:
  clf_dt = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
  clf_dt.fit(X_train, y_train)
  clf_dts.append(clf_dt)
In [ ]:
train_scores = [clf_dt.score(X_train, y_train) for clf_dt in clf_dts]
test_scores = [clf_dt.score(X_test, y_test) for clf_dt in clf_dts]

fig, ax =plt.subplots()
ax.set_xlabel("alpha")
ax.set_ylabel("accuracy")
ax.set_title("Accuracy vs alpha for training and testing sets")
ax.plot(ccp_alphas,train_scores, marker ='o',label='train',drawstyle='steps-post')

ax.plot(ccp_alphas,test_scores, marker ='o',label='test',drawstyle='steps-post')
ax.legend()
plt.show()
In [ ]:
clf_dt = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1,random_state=42, ccp_alpha=0.003)

scores= cross_val_score(clf_dt,X_train,y_train, cv=10)

df=pd.DataFrame(data={'tree':range(10), 'accuracy':scores})

df.plot(x='tree', y='accuracy',marker='o',linestyle='--')
Out[ ]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f87324c1410>
In [ ]:
alpha_loop_values =[]

for ccp_alpha in ccp_alphas:
  clf_dt = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
  scores= cross_val_score(clf_dt,X_train,y_train, cv=10)
  alpha_loop_values.append([ccp_alpha,np.mean(scores), np.std(scores)])

alpha_results = pd.DataFrame(alpha_loop_values,
                               columns=['alpha','mean_accuracy','std'])
  
alpha_results.plot(x='alpha',
                   y='mean_accuracy',
                   marker='o',
                   linestyle='--')
Out[ ]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f87320437d0>
In [ ]:
alpha_results[(alpha_results['alpha']>0.0025)
&
(alpha_results['alpha']<0.005)]
Out[ ]:
alpha mean_accuracy std
201 0.002712 0.825980 0.012110
202 0.002828 0.824838 0.009346
203 0.003022 0.823699 0.010039
204 0.003397 0.819710 0.015653
205 0.003625 0.819710 0.014745
206 0.003792 0.819425 0.014533
207 0.004665 0.817431 0.014101
In [ ]:
ideal_ccp_alpha = 0.003242
In [ ]:
clf_dt_pruned = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1,random_state=42, ccp_alpha=ideal_ccp_alpha)

clf_dt_pruned = clf_dt_pruned.fit(X_train, y_train)
In [ ]:
plot_confusion_matrix(clf_dt_pruned,
                      X_test,
                      y_test,
                      display_labels=['not aroused','aroused'])
/usr/local/lib/python3.7/dist-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function plot_confusion_matrix is deprecated; Function `plot_confusion_matrix` is deprecated in 1.0 and will be removed in 1.2. Use one of the class methods: ConfusionMatrixDisplay.from_predictions or ConfusionMatrixDisplay.from_estimator.
  warnings.warn(msg, category=FutureWarning)
Out[ ]:
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x7f8732590310>
In [ ]:
plt.figure(figsize=(15,7.5))

from sklearn.tree import plot_tree
plot_tree(clf_dt_pruned,
          filled=True,
          rounded=True,
          class_names=["not aroused","aroused"],
          feature_names=X.columns)
Out[ ]:
[Text(453.375, 366.93, 'valence <= 0.698\ngini = 0.372\nsamples = 3511\nvalue = [2645, 866]\nclass = not aroused'),
 Text(279.0, 285.39, 'semsize <= 0.638\ngini = 0.25\nsamples = 2829\nvalue = [2414, 415]\nclass = not aroused'),
 Text(139.5, 203.85000000000002, 'semsize <= 0.496\ngini = 0.164\nsamples = 2299\nvalue = [2092, 207]\nclass = not aroused'),
 Text(69.75, 122.31, 'gini = 0.101\nsamples = 1542\nvalue = [1460, 82]\nclass = not aroused'),
 Text(209.25, 122.31, 'gini = 0.276\nsamples = 757\nvalue = [632, 125]\nclass = not aroused'),
 Text(418.5, 203.85000000000002, 'dominance <= 0.608\ngini = 0.477\nsamples = 530\nvalue = [322, 208]\nclass = not aroused'),
 Text(348.75, 122.31, 'valence <= 0.402\ngini = 0.452\nsamples = 460\nvalue = [301, 159]\nclass = not aroused'),
 Text(279.0, 40.77000000000004, 'gini = 0.499\nsamples = 244\nvalue = [128, 116]\nclass = not aroused'),
 Text(418.5, 40.77000000000004, 'gini = 0.319\nsamples = 216\nvalue = [173, 43]\nclass = not aroused'),
 Text(488.25, 122.31, 'gini = 0.42\nsamples = 70\nvalue = [21, 49]\nclass = aroused'),
 Text(627.75, 285.39, 'dominance <= 0.585\ngini = 0.448\nsamples = 682\nvalue = [231, 451]\nclass = aroused'),
 Text(558.0, 203.85000000000002, 'gini = 0.494\nsamples = 269\nvalue = [149, 120]\nclass = not aroused'),
 Text(697.5, 203.85000000000002, 'valence <= 0.801\ngini = 0.318\nsamples = 413\nvalue = [82, 331]\nclass = aroused'),
 Text(627.75, 122.31, 'perceivability <= 0.326\ngini = 0.464\nsamples = 150\nvalue = [55, 95]\nclass = aroused'),
 Text(558.0, 40.77000000000004, 'gini = 0.469\nsamples = 56\nvalue = [35, 21]\nclass = not aroused'),
 Text(697.5, 40.77000000000004, 'gini = 0.335\nsamples = 94\nvalue = [20, 74]\nclass = aroused'),
 Text(767.25, 122.31, 'gini = 0.184\nsamples = 263\nvalue = [27, 236]\nclass = aroused')]
In [ ]:
y_pred = clf_dt_pruned.predict(X_train)
y_pred = clf_dt_pruned.predict(X_test)

print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred,average='weighted'))
print(classification_report(y_test, y_pred))

y_score = clf_dt_pruned.predict_proba(X_test)

fpr, tpr, th = roc_curve(y_test, y_score[:,1])

roc_auc = auc(fpr, tpr)
print(roc_auc)

plt.figure(figsize=(8,5))

plt.plot(fpr, tpr)

plt.plot([0,1], [0,1], 'k--')
plt.xlabel('False Positive Rate', fontsize=20)
plt.ylabel('True Positive Rate', fontsize=20)

plt.tick_params(axis='both', which='major', labelsize=22)
plt.show()
Accuracy 0.8129803586678053
F1-score 0.7877208681457515
              precision    recall  f1-score   support

         0.0       0.82      0.97      0.88       867
         1.0       0.79      0.38      0.51       304

    accuracy                           0.81      1171
   macro avg       0.80      0.67      0.70      1171
weighted avg       0.81      0.81      0.79      1171

0.8427881988708796

Valence¶

In [ ]:
refvar="valence"
taglio=0.67

X=df_class_ref.drop(refvar,axis=1).copy()

y=df_class_ref[refvar].copy()

y_up_index = y >= taglio

y[y_up_index]=1

y_zero_index = y < taglio

y[y_zero_index]=0

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

clf_dt = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1,random_state=42)

clf_dt = clf_dt.fit(X_train, y_train)
In [ ]:
path = clf_dt.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas
ccp_alphas = ccp_alphas[:-1]

clf_dts=[]

for ccp_alpha in ccp_alphas:
  clf_dt = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
  clf_dt.fit(X_train, y_train)
  clf_dts.append(clf_dt)

train_scores = [clf_dt.score(X_train, y_train) for clf_dt in clf_dts]
test_scores = [clf_dt.score(X_test, y_test) for clf_dt in clf_dts]

fig, ax =plt.subplots()
ax.set_xlabel("alpha")
ax.set_ylabel("accuracy")
ax.set_title("Accuracy vs alpha for training and testing sets")
ax.plot(ccp_alphas,train_scores, marker ='o',label='train',drawstyle='steps-post')

ax.plot(ccp_alphas,test_scores, marker ='o',label='test',drawstyle='steps-post')
ax.legend()
plt.show()
In [ ]:
alpha_loop_values =[]

for ccp_alpha in ccp_alphas:
  clf_dt = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1,random_state=0, ccp_alpha=ccp_alpha)
  scores= cross_val_score(clf_dt,X_train,y_train, cv=10)
  alpha_loop_values.append([ccp_alpha,np.mean(scores), np.std(scores)])

alpha_results = pd.DataFrame(alpha_loop_values,
                               columns=['alpha','mean_accuracy','std'])
  
alpha_results.plot(x='alpha',
                   y='mean_accuracy',
                   marker='o',
                   linestyle='--')
Out[ ]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f87312c8e90>
In [ ]:
alpha_results[(alpha_results['alpha']>0.0018)
&
(alpha_results['alpha']<0.0028)]
Out[ ]:
alpha mean_accuracy std
140 0.001872 0.886080 0.017318
141 0.001910 0.885510 0.016445
142 0.002273 0.885795 0.016721
143 0.002309 0.885795 0.016721
144 0.002383 0.885795 0.016721
In [ ]:
ideal_ccp_alpha = 0.002084

ideal_ccp_alpha = float(ideal_ccp_alpha)

clf_dt_pruned = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1,random_state=42, ccp_alpha=ideal_ccp_alpha)

clf_dt_pruned = clf_dt_pruned.fit(X_train, y_train)

plt.figure(figsize=(15,7.5))

from sklearn.tree import plot_tree
plot_tree(clf_dt_pruned,
          filled=True,
          rounded=True,
          class_names=["not val","val"],
          feature_names=X.columns)
Out[ ]:
[Text(397.575, 378.5785714285714, 'dominance <= 0.595\ngini = 0.354\nsamples = 3511\nvalue = [2703, 808]\nclass = not val'),
 Text(167.4, 320.3357142857143, 'masculinity <= 0.35\ngini = 0.225\nsamples = 2845\nvalue = [2477, 368]\nclass = not val'),
 Text(83.7, 262.09285714285716, 'arousal <= 0.35\ngini = 0.497\nsamples = 321\nvalue = [149, 172]\nclass = val'),
 Text(41.85, 203.85, 'gini = 0.313\nsamples = 72\nvalue = [58, 14]\nclass = not val'),
 Text(125.55000000000001, 203.85, 'dominance <= 0.52\ngini = 0.464\nsamples = 249\nvalue = [91, 158]\nclass = val'),
 Text(83.7, 145.60714285714283, 'aoa <= 0.287\ngini = 0.5\nsamples = 142\nvalue = [69, 73]\nclass = val'),
 Text(41.85, 87.3642857142857, 'gini = 0.289\nsamples = 40\nvalue = [7, 33]\nclass = val'),
 Text(125.55000000000001, 87.3642857142857, 'gini = 0.477\nsamples = 102\nvalue = [62, 40]\nclass = not val'),
 Text(167.4, 145.60714285714283, 'gini = 0.327\nsamples = 107\nvalue = [22, 85]\nclass = val'),
 Text(251.10000000000002, 262.09285714285716, 'arousal <= 0.453\ngini = 0.143\nsamples = 2524\nvalue = [2328, 196]\nclass = not val'),
 Text(209.25, 203.85, 'gini = 0.062\nsamples = 1777\nvalue = [1720, 57]\nclass = not val'),
 Text(292.95, 203.85, 'masculinity <= 0.507\ngini = 0.303\nsamples = 747\nvalue = [608, 139]\nclass = not val'),
 Text(251.10000000000002, 145.60714285714283, 'dominance <= 0.467\ngini = 0.484\nsamples = 244\nvalue = [144, 100]\nclass = not val'),
 Text(209.25, 87.3642857142857, 'gini = 0.288\nsamples = 103\nvalue = [85, 18]\nclass = not val'),
 Text(292.95, 87.3642857142857, 'semsize <= 0.485\ngini = 0.487\nsamples = 141\nvalue = [59, 82]\nclass = val'),
 Text(251.10000000000002, 29.121428571428567, 'gini = 0.415\nsamples = 51\nvalue = [36, 15]\nclass = not val'),
 Text(334.8, 29.121428571428567, 'gini = 0.38\nsamples = 90\nvalue = [23, 67]\nclass = val'),
 Text(334.8, 145.60714285714283, 'gini = 0.143\nsamples = 503\nvalue = [464, 39]\nclass = not val'),
 Text(627.75, 320.3357142857143, 'arousal <= 0.488\ngini = 0.448\nsamples = 666\nvalue = [226, 440]\nclass = val'),
 Text(502.20000000000005, 262.09285714285716, 'perceivability <= 0.463\ngini = 0.461\nsamples = 189\nvalue = [121, 68]\nclass = not val'),
 Text(460.35, 203.85, 'masculinity <= 0.447\ngini = 0.487\nsamples = 105\nvalue = [44, 61]\nclass = val'),
 Text(418.5, 145.60714285714283, 'gini = 0.185\nsamples = 29\nvalue = [3, 26]\nclass = val'),
 Text(502.20000000000005, 145.60714285714283, 'arousal <= 0.438\ngini = 0.497\nsamples = 76\nvalue = [41, 35]\nclass = not val'),
 Text(460.35, 87.3642857142857, 'gini = 0.436\nsamples = 56\nvalue = [38, 18]\nclass = not val'),
 Text(544.0500000000001, 87.3642857142857, 'gini = 0.255\nsamples = 20\nvalue = [3, 17]\nclass = val'),
 Text(544.0500000000001, 203.85, 'gini = 0.153\nsamples = 84\nvalue = [77, 7]\nclass = not val'),
 Text(753.3000000000001, 262.09285714285716, 'masculinity <= 0.684\ngini = 0.343\nsamples = 477\nvalue = [105, 372]\nclass = val'),
 Text(711.45, 203.85, 'arousal <= 0.575\ngini = 0.236\nsamples = 409\nvalue = [56, 353]\nclass = val'),
 Text(669.6, 145.60714285714283, 'masculinity <= 0.551\ngini = 0.447\nsamples = 83\nvalue = [28, 55]\nclass = val'),
 Text(627.75, 87.3642857142857, 'gini = 0.368\nsamples = 70\nvalue = [17, 53]\nclass = val'),
 Text(711.45, 87.3642857142857, 'gini = 0.26\nsamples = 13\nvalue = [11, 2]\nclass = not val'),
 Text(753.3000000000001, 145.60714285714283, 'gini = 0.157\nsamples = 326\nvalue = [28, 298]\nclass = val'),
 Text(795.15, 203.85, 'gini = 0.403\nsamples = 68\nvalue = [49, 19]\nclass = not val')]
In [ ]:
y_pred = clf_dt_pruned.predict(X_train)
y_pred = clf_dt_pruned.predict(X_test)

print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred,average='weighted'))
print(classification_report(y_test, y_pred))

y_score = clf_dt_pruned.predict_proba(X_test)

plot_confusion_matrix(clf_dt_pruned,
                      X_test,
                      y_test,
                      display_labels=['not val','val'])


fpr, tpr, th = roc_curve(y_test, y_score[:,1])

roc_auc = auc(fpr, tpr)
print(roc_auc)

plt.figure(figsize=(8,5))

plt.plot(fpr, tpr)

plt.plot([0,1], [0,1], 'k--')
plt.xlabel('False Positive Rate', fontsize=20)
plt.ylabel('True Positive Rate', fontsize=20)

plt.tick_params(axis='both', which='major', labelsize=22)
plt.show()
Accuracy 0.8736122971818958
F1-score 0.8681346409232079
              precision    recall  f1-score   support

         0.0       0.88      0.95      0.92       864
         1.0       0.83      0.65      0.73       307

    accuracy                           0.87      1171
   macro avg       0.86      0.80      0.82      1171
weighted avg       0.87      0.87      0.87      1171

0.8893394106647363
/usr/local/lib/python3.7/dist-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function plot_confusion_matrix is deprecated; Function `plot_confusion_matrix` is deprecated in 1.0 and will be removed in 1.2. Use one of the class methods: ConfusionMatrixDisplay.from_predictions or ConfusionMatrixDisplay.from_estimator.
  warnings.warn(msg, category=FutureWarning)
In [ ]:

Dominance¶

In [ ]:
refvar="dominance"
taglio=0.57
In [ ]:
X=df_class_ref.drop(refvar,axis=1).copy()

y=df_class_ref[refvar].copy()

y_up_index = y >= taglio

y[y_up_index]=1

y_zero_index = y < taglio

y[y_zero_index]=0

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

clf_dt = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1,random_state=42)

clf_dt = clf_dt.fit(X_train, y_train)

path = clf_dt.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas
ccp_alphas = ccp_alphas[:-1]

clf_dts=[]

for ccp_alpha in ccp_alphas:
  clf_dt = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
  clf_dt.fit(X_train, y_train)
  clf_dts.append(clf_dt)

train_scores = [clf_dt.score(X_train, y_train) for clf_dt in clf_dts]
test_scores = [clf_dt.score(X_test, y_test) for clf_dt in clf_dts]

fig, ax =plt.subplots()
ax.set_xlabel("alpha")
ax.set_ylabel("accuracy")
ax.set_title("Accuracy vs alpha for training and testing sets")
ax.plot(ccp_alphas,train_scores, marker ='o',label='train',drawstyle='steps-post')

ax.plot(ccp_alphas,test_scores, marker ='o',label='test',drawstyle='steps-post')
ax.legend()
plt.show()

alpha_loop_values =[]

for ccp_alpha in ccp_alphas:
  clf_dt = DecisionTreeClassifier(random_state=42, ccp_alpha=ccp_alpha)
  scores= cross_val_score(clf_dt,X_train,y_train, cv=10)
  alpha_loop_values.append([ccp_alpha,np.mean(scores), np.std(scores)])

alpha_results = pd.DataFrame(alpha_loop_values,
                               columns=['alpha','mean_accuracy','std'])
  
alpha_results.plot(x='alpha',
                   y='mean_accuracy',
                   yerr='std',
                   marker='o',
                   linestyle='--')
Out[ ]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f87323e1e10>
In [ ]:
alpha_results[(alpha_results['alpha']>0.0018)
&
(alpha_results['alpha']<0.0041)]
Out[ ]:
alpha mean_accuracy std
197 0.001963 0.839931 0.007957
198 0.002448 0.837366 0.010027
199 0.002703 0.837364 0.011967
200 0.002885 0.835655 0.013418
201 0.003540 0.838219 0.014275
202 0.003945 0.838219 0.014275
203 0.003988 0.838219 0.014275
In [ ]:
ideal_ccp_alpha = 0.001963

ideal_ccp_alpha = float(ideal_ccp_alpha)

clf_dt_pruned = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1,random_state=42, ccp_alpha=ideal_ccp_alpha)

clf_dt_pruned = clf_dt_pruned.fit(X_train, y_train)

plot_confusion_matrix(clf_dt_pruned,
                      X_test,
                      y_test,
                      display_labels=['not dominant','dominant'])

plt.figure(figsize=(15,7.5))

from sklearn.tree import plot_tree
plot_tree(clf_dt_pruned,
          filled=True,
          rounded=True,
          class_names=["not dominant","dominant"],
          feature_names=X.columns)

y_pred = clf_dt_pruned.predict(X_train)
y_pred = clf_dt_pruned.predict(X_test)

print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

y_score = clf_dt_pruned.predict_proba(X_test)

fpr, tpr, th = roc_curve(y_test, y_score[:,1])

roc_auc = auc(fpr, tpr)
print(roc_auc)

plt.figure(figsize=(8,5))

plt.plot(fpr, tpr)

plt.plot([0,1], [0,1], 'k--')
plt.xlabel('False Positive Rate', fontsize=20)
plt.ylabel('True Positive Rate', fontsize=20)

plt.tick_params(axis='both', which='major', labelsize=22)
plt.show()
/usr/local/lib/python3.7/dist-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function plot_confusion_matrix is deprecated; Function `plot_confusion_matrix` is deprecated in 1.0 and will be removed in 1.2. Use one of the class methods: ConfusionMatrixDisplay.from_predictions or ConfusionMatrixDisplay.from_estimator.
  warnings.warn(msg, category=FutureWarning)
Accuracy 0.8300597779675492
F1-score 0.6223908918406071
              precision    recall  f1-score   support

         0.0       0.86      0.92      0.89       879
         1.0       0.70      0.56      0.62       292

    accuracy                           0.83      1171
   macro avg       0.78      0.74      0.76      1171
weighted avg       0.82      0.83      0.82      1171

0.8587416429005564

Familiarity¶

In [ ]:
refvar="familiarity"
taglio=0.6

X=df_class_ref.drop(refvar,axis=1).copy()

y=df_class_ref[refvar].copy()

y_up_index = y >= taglio

y[y_up_index]=1

y_zero_index = y < taglio

y[y_zero_index]=0

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

clf_dt = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1,random_state=42)

clf_dt = clf_dt.fit(X_train, y_train)

path = clf_dt.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas
ccp_alphas = ccp_alphas[:-1]

clf_dts=[]

for ccp_alpha in ccp_alphas:
  clf_dt = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
  clf_dt.fit(X_train, y_train)
  clf_dts.append(clf_dt)

train_scores = [clf_dt.score(X_train, y_train) for clf_dt in clf_dts]
test_scores = [clf_dt.score(X_test, y_test) for clf_dt in clf_dts]

fig, ax =plt.subplots()
ax.set_xlabel("alpha")
ax.set_ylabel("accuracy")
ax.set_title("Accuracy vs alpha for training and testing sets")
ax.plot(ccp_alphas,train_scores, marker ='o',label='train',drawstyle='steps-post')

ax.plot(ccp_alphas,test_scores, marker ='o',label='test',drawstyle='steps-post')
ax.legend()
plt.show()

alpha_loop_values =[]

for ccp_alpha in ccp_alphas:
  clf_dt = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1,random_state=0, ccp_alpha=ccp_alpha)
  scores= cross_val_score(clf_dt,X_train,y_train, cv=10)
  alpha_loop_values.append([ccp_alpha,np.mean(scores), np.std(scores)])

alpha_results = pd.DataFrame(alpha_loop_values,
                               columns=['alpha','mean_accuracy','std'])
  
alpha_results.plot(x='alpha',
                   y='mean_accuracy',
                   marker='o',
                   linestyle='--')
Out[ ]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f8732336c50>
In [ ]:
alpha_results[(alpha_results['alpha']>0.0009)
&
(alpha_results['alpha']<0.0011)]
Out[ ]:
alpha mean_accuracy std
182 0.000904 0.824559 0.022300
183 0.000911 0.826553 0.021298
184 0.000930 0.829968 0.019192
185 0.000944 0.830822 0.018603
186 0.000977 0.831677 0.019085
187 0.001006 0.831108 0.018665
188 0.001008 0.831108 0.018665
189 0.001015 0.830539 0.018191
190 0.001017 0.830254 0.018189
191 0.001020 0.829399 0.017752
192 0.001047 0.829399 0.017475
193 0.001051 0.829399 0.017475
194 0.001082 0.828829 0.016620
In [ ]:
ideal_ccp_alpha = 0.000977

ideal_ccp_alpha = float(ideal_ccp_alpha)

clf_dt_pruned = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1,random_state=42, ccp_alpha=ideal_ccp_alpha)

clf_dt_pruned = clf_dt_pruned.fit(X_train, y_train)

plot_confusion_matrix(clf_dt_pruned,
                      X_test,
                      y_test,
                      display_labels=['not dominant','dominant'])

plt.figure(figsize=(15,7.5))

from sklearn.tree import plot_tree
plot_tree(clf_dt_pruned,
          filled=True,
          rounded=True,
          class_names=["not valueable","valueable"],
          feature_names=X.columns)

y_pred = clf_dt_pruned.predict(X_train)
y_pred = clf_dt_pruned.predict(X_test)

print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

y_score = clf_dt_pruned.predict_proba(X_test)

fpr, tpr, th = roc_curve(y_test, y_score[:,1])

roc_auc = auc(fpr, tpr)
print(roc_auc)

plt.figure(figsize=(8,5))

plt.plot(fpr, tpr)

plt.plot([0,1], [0,1], 'k--')
plt.xlabel('False Positive Rate', fontsize=20)
plt.ylabel('True Positive Rate', fontsize=20)

plt.tick_params(axis='both', which='major', labelsize=22)
plt.show()
/usr/local/lib/python3.7/dist-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function plot_confusion_matrix is deprecated; Function `plot_confusion_matrix` is deprecated in 1.0 and will be removed in 1.2. Use one of the class methods: ConfusionMatrixDisplay.from_predictions or ConfusionMatrixDisplay.from_estimator.
  warnings.warn(msg, category=FutureWarning)
Accuracy 0.8121263877028181
F1-score 0.8692033293697977
              precision    recall  f1-score   support

         0.0       0.64      0.69      0.67       317
         1.0       0.88      0.86      0.87       854

    accuracy                           0.81      1171
   macro avg       0.76      0.77      0.77      1171
weighted avg       0.82      0.81      0.81      1171

0.8300814870086215

Semsize¶

In [ ]:
refvar="semsize"
taglio=0.63

X=df_class_ref.drop(refvar,axis=1).copy()

y=df_class_ref[refvar].copy()

y_up_index = y >= taglio

y[y_up_index]=1

y_zero_index = y < taglio

y[y_zero_index]=0

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

clf_dt = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1,random_state=42)

clf_dt = clf_dt.fit(X_train, y_train)

path = clf_dt.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas
ccp_alphas = ccp_alphas[:-1]

clf_dts=[]

for ccp_alpha in ccp_alphas:
  clf_dt = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
  clf_dt.fit(X_train, y_train)
  clf_dts.append(clf_dt)

train_scores = [clf_dt.score(X_train, y_train) for clf_dt in clf_dts]
test_scores = [clf_dt.score(X_test, y_test) for clf_dt in clf_dts]

fig, ax =plt.subplots()
ax.set_xlabel("alpha")
ax.set_ylabel("accuracy")
ax.set_title("Accuracy vs alpha for training and testing sets")
ax.plot(ccp_alphas,train_scores, marker ='o',label='train',drawstyle='steps-post')

ax.plot(ccp_alphas,test_scores, marker ='o',label='test',drawstyle='steps-post')
ax.legend()
plt.show()

alpha_loop_values =[]

for ccp_alpha in ccp_alphas:
  clf_dt = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1,random_state=0, ccp_alpha=ccp_alpha)
  scores= cross_val_score(clf_dt,X_train,y_train, cv=10)
  alpha_loop_values.append([ccp_alpha,np.mean(scores), np.std(scores)])

alpha_results = pd.DataFrame(alpha_loop_values,
                               columns=['alpha','mean_accuracy','std'])
  
alpha_results.plot(x='alpha',
                   y='mean_accuracy',
                   marker='o',
                   linestyle='--')
Out[ ]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f8731d5dc10>
In [ ]:
alpha_results[(alpha_results['alpha']>0.0015)
&
(alpha_results['alpha']<0.002)]
Out[ ]:
alpha mean_accuracy std
229 0.001589 0.800630 0.022148
230 0.001601 0.801199 0.022718
231 0.001606 0.801199 0.022718
232 0.001663 0.797781 0.024255
233 0.001754 0.797211 0.023980
234 0.001766 0.797211 0.023980
235 0.001851 0.795502 0.027092
236 0.001869 0.795502 0.027092
237 0.001922 0.795217 0.026902
238 0.001968 0.795217 0.026902
In [ ]:
ideal_ccp_alpha = 0.001601

ideal_ccp_alpha = float(ideal_ccp_alpha)

clf_dt_pruned = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1,random_state=42, ccp_alpha=ideal_ccp_alpha)

clf_dt_pruned = clf_dt_pruned.fit(X_train, y_train)

plot_confusion_matrix(clf_dt_pruned,
                      X_test,
                      y_test,
                      display_labels=['small','big'])

plt.figure(figsize=(15,7.5))

from sklearn.tree import plot_tree
plot_tree(clf_dt_pruned,
          filled=True,
          rounded=True,
          class_names=["small","big"],
          feature_names=X.columns)

y_pred = clf_dt_pruned.predict(X_train)
y_pred = clf_dt_pruned.predict(X_test)

print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average=None))
print(classification_report(y_test, y_pred))

y_score = clf_dt_pruned.predict_proba(X_test)

fpr, tpr, th = roc_curve(y_test, y_score[:,1])

roc_auc = auc(fpr, tpr)
print(roc_auc)

plt.figure(figsize=(8,5))

plt.plot(fpr, tpr)

plt.plot([0,1], [0,1], 'k--')
plt.xlabel('False Positive Rate', fontsize=20)
plt.ylabel('True Positive Rate', fontsize=20)

plt.tick_params(axis='both', which='major', labelsize=22)
plt.show()
/usr/local/lib/python3.7/dist-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function plot_confusion_matrix is deprecated; Function `plot_confusion_matrix` is deprecated in 1.0 and will be removed in 1.2. Use one of the class methods: ConfusionMatrixDisplay.from_predictions or ConfusionMatrixDisplay.from_estimator.
  warnings.warn(msg, category=FutureWarning)
Accuracy 0.7950469684030743
F1-score [0.87261146 0.47598253]
              precision    recall  f1-score   support

         0.0       0.81      0.95      0.87       865
         1.0       0.72      0.36      0.48       306

    accuracy                           0.80      1171
   macro avg       0.76      0.65      0.67      1171
weighted avg       0.78      0.80      0.77      1171

0.7947183497676528

Masculinity¶

In [ ]:
refvar="masculinity"
taglio=0.6

X=df_class_ref.drop(refvar,axis=1).copy()

y=df_class_ref[refvar].copy()

y_up_index = y >= taglio

y[y_up_index]=1

y_zero_index = y < taglio

y[y_zero_index]=0

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

clf_dt = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1,random_state=42)

clf_dt = clf_dt.fit(X_train, y_train)

path = clf_dt.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas
ccp_alphas = ccp_alphas[:-1]

clf_dts=[]

for ccp_alpha in ccp_alphas:
  clf_dt = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
  clf_dt.fit(X_train, y_train)
  clf_dts.append(clf_dt)

train_scores = [clf_dt.score(X_train, y_train) for clf_dt in clf_dts]
test_scores = [clf_dt.score(X_test, y_test) for clf_dt in clf_dts]

fig, ax =plt.subplots()
ax.set_xlabel("alpha")
ax.set_ylabel("accuracy")
ax.set_title("Accuracy vs alpha for training and testing sets")
ax.plot(ccp_alphas,train_scores, marker ='o',label='train',drawstyle='steps-post')

ax.plot(ccp_alphas,test_scores, marker ='o',label='test',drawstyle='steps-post')
ax.legend()
plt.show()

alpha_loop_values =[]

for ccp_alpha in ccp_alphas:
  clf_dt = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1,random_state=0, ccp_alpha=ccp_alpha)
  scores= cross_val_score(clf_dt,X_train,y_train, cv=10)
  alpha_loop_values.append([ccp_alpha,np.mean(scores), np.std(scores)])

alpha_results = pd.DataFrame(alpha_loop_values,
                               columns=['alpha','mean_accuracy','std'])
  
alpha_results.plot(x='alpha',
                   y='mean_accuracy',
                   marker='o',
                   linestyle='--')
Out[ ]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f87322800d0>
In [ ]:
alpha_results[(alpha_results['alpha']>0.0015)
&
(alpha_results['alpha']<0.0025)]
Out[ ]:
alpha mean_accuracy std
252 0.001542 0.751076 0.016979
253 0.001555 0.751361 0.017161
254 0.001591 0.751645 0.016813
255 0.001603 0.752215 0.017575
256 0.001677 0.753923 0.013112
257 0.001754 0.751363 0.016140
258 0.001859 0.753642 0.018351
259 0.001972 0.754779 0.016440
260 0.001985 0.754779 0.015887
261 0.002007 0.754779 0.015887
262 0.002352 0.753066 0.011932
263 0.002381 0.750217 0.008731
In [ ]:
ideal_ccp_alpha = 0.001985

ideal_ccp_alpha = float(ideal_ccp_alpha)

clf_dt_pruned = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1,random_state=42, ccp_alpha=ideal_ccp_alpha)

clf_dt_pruned = clf_dt_pruned.fit(X_train, y_train)

plot_confusion_matrix(clf_dt_pruned,
                      X_test,
                      y_test,
                      display_labels=['feminine','masculine'])

plt.figure(figsize=(15,7.5))

from sklearn.tree import plot_tree
plot_tree(clf_dt_pruned,
          filled=True,
          rounded=True,
          class_names=["feminine","masculine"],
          feature_names=X.columns)

y_pred = clf_dt_pruned.predict(X_train)
y_pred = clf_dt_pruned.predict(X_test)

print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

y_score = clf_dt_pruned.predict_proba(X_test)

fpr, tpr, th = roc_curve(y_test, y_score[:,1])

roc_auc = auc(fpr, tpr)
print(roc_auc)

plt.figure(figsize=(8,5))

plt.plot(fpr, tpr)

plt.plot([0,1], [0,1], 'k--')
plt.xlabel('False Positive Rate', fontsize=20)
plt.ylabel('True Positive Rate', fontsize=20)

plt.tick_params(axis='both', which='major', labelsize=22)
plt.show()
/usr/local/lib/python3.7/dist-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function plot_confusion_matrix is deprecated; Function `plot_confusion_matrix` is deprecated in 1.0 and will be removed in 1.2. Use one of the class methods: ConfusionMatrixDisplay.from_predictions or ConfusionMatrixDisplay.from_estimator.
  warnings.warn(msg, category=FutureWarning)
Accuracy 0.7762596071733561
F1-score 0.5130111524163568
              precision    recall  f1-score   support

         0.0       0.83      0.88      0.85       873
         1.0       0.57      0.46      0.51       298

    accuracy                           0.78      1171
   macro avg       0.70      0.67      0.68      1171
weighted avg       0.76      0.78      0.77      1171

0.7482279726623462

Polysemy¶

In [ ]:
refvar="polysemy"
taglio=0.63

X=df_class_ref.drop(refvar,axis=1).copy()

y=df_class_ref[refvar].copy()

y_up_index = y >= taglio

y[y_up_index]=1

y_zero_index = y < taglio

y[y_zero_index]=0

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

clf_dt = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1,random_state=42)

clf_dt = clf_dt.fit(X_train, y_train)

path = clf_dt.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas
ccp_alphas = ccp_alphas[:-1]

clf_dts=[]

for ccp_alpha in ccp_alphas:
  clf_dt = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
  clf_dt.fit(X_train, y_train)
  clf_dts.append(clf_dt)

train_scores = [clf_dt.score(X_train, y_train) for clf_dt in clf_dts]
test_scores = [clf_dt.score(X_test, y_test) for clf_dt in clf_dts]

fig, ax =plt.subplots()
ax.set_xlabel("alpha")
ax.set_ylabel("accuracy")
ax.set_title("Accuracy vs alpha for training and testing sets")
ax.plot(ccp_alphas,train_scores, marker ='o',label='train',drawstyle='steps-post')

ax.plot(ccp_alphas,test_scores, marker ='o',label='test',drawstyle='steps-post')
ax.legend()
plt.show()

alpha_loop_values =[]

for ccp_alpha in ccp_alphas:
  clf_dt = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1,random_state=0, ccp_alpha=ccp_alpha)
  scores= cross_val_score(clf_dt,X_train,y_train, cv=10)
  alpha_loop_values.append([ccp_alpha,np.mean(scores), np.std(scores)])

alpha_results = pd.DataFrame(alpha_loop_values,
                               columns=['alpha','mean_accuracy','std'])
  
alpha_results.plot(x='alpha',
                   y='mean_accuracy',
                   marker='o',
                   linestyle='--')
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-173-fd3496329b61> in <module>()
     48 for ccp_alpha in ccp_alphas:
     49   clf_dt = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1,random_state=0, ccp_alpha=ccp_alpha)
---> 50   scores= cross_val_score(clf_dt,X_train,y_train, cv=10)
     51   alpha_loop_values.append([ccp_alpha,np.mean(scores), np.std(scores)])
     52 

/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py in cross_val_score(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, error_score)
    519         fit_params=fit_params,
    520         pre_dispatch=pre_dispatch,
--> 521         error_score=error_score,
    522     )
    523     return cv_results["test_score"]

/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score, return_estimator, error_score)
    281             error_score=error_score,
    282         )
--> 283         for train, test in cv.split(X, y, groups)
    284     )
    285 

/usr/local/lib/python3.7/dist-packages/joblib/parallel.py in __call__(self, iterable)
   1044                 self._iterating = self._original_iterator is not None
   1045 
-> 1046             while self.dispatch_one_batch(iterator):
   1047                 pass
   1048 

/usr/local/lib/python3.7/dist-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
    859                 return False
    860             else:
--> 861                 self._dispatch(tasks)
    862                 return True
    863 

/usr/local/lib/python3.7/dist-packages/joblib/parallel.py in _dispatch(self, batch)
    777         with self._lock:
    778             job_idx = len(self._jobs)
--> 779             job = self._backend.apply_async(batch, callback=cb)
    780             # A job can complete so quickly than its callback is
    781             # called before we get here, causing self._jobs to

/usr/local/lib/python3.7/dist-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
    206     def apply_async(self, func, callback=None):
    207         """Schedule a func to be run"""
--> 208         result = ImmediateResult(func)
    209         if callback:
    210             callback(result)

/usr/local/lib/python3.7/dist-packages/joblib/_parallel_backends.py in __init__(self, batch)
    570         # Don't delay the application, to avoid keeping the input
    571         # arguments in memory
--> 572         self.results = batch()
    573 
    574     def get(self):

/usr/local/lib/python3.7/dist-packages/joblib/parallel.py in __call__(self)
    261         with parallel_backend(self._backend, n_jobs=self._n_jobs):
    262             return [func(*args, **kwargs)
--> 263                     for func, args, kwargs in self.items]
    264 
    265     def __reduce__(self):

/usr/local/lib/python3.7/dist-packages/joblib/parallel.py in <listcomp>(.0)
    261         with parallel_backend(self._backend, n_jobs=self._n_jobs):
    262             return [func(*args, **kwargs)
--> 263                     for func, args, kwargs in self.items]
    264 
    265     def __reduce__(self):

/usr/local/lib/python3.7/dist-packages/sklearn/utils/fixes.py in __call__(self, *args, **kwargs)
    209     def __call__(self, *args, **kwargs):
    210         with config_context(**self.config):
--> 211             return self.function(*args, **kwargs)
    212 
    213 

/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, split_progress, candidate_progress, error_score)
    679             estimator.fit(X_train, **fit_params)
    680         else:
--> 681             estimator.fit(X_train, y_train, **fit_params)
    682 
    683     except Exception:

/usr/local/lib/python3.7/dist-packages/sklearn/tree/_classes.py in fit(self, X, y, sample_weight, check_input, X_idx_sorted)
    940             sample_weight=sample_weight,
    941             check_input=check_input,
--> 942             X_idx_sorted=X_idx_sorted,
    943         )
    944         return self

/usr/local/lib/python3.7/dist-packages/sklearn/tree/_classes.py in fit(self, X, y, sample_weight, check_input, X_idx_sorted)
    418             )
    419 
--> 420         builder.build(self.tree_, X, y, sample_weight)
    421 
    422         if self.n_outputs_ == 1 and is_classifier(self):

KeyboardInterrupt: 
In [ ]:
alpha_results[(alpha_results['alpha']>0.001)
&
(alpha_results['alpha']<0.004)]
In [ ]:
ideal_ccp_alpha = 0.001944

ideal_ccp_alpha = float(ideal_ccp_alpha)

clf_dt_pruned = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1,random_state=42, ccp_alpha=ideal_ccp_alpha)

clf_dt_pruned = clf_dt_pruned.fit(X_train, y_train)

plot_confusion_matrix(clf_dt_pruned,
                      X_test,
                      y_test,
                      display_labels=['not pol','pol'])

plt.figure(figsize=(15,7.5))

from sklearn.tree import plot_tree
plot_tree(clf_dt_pruned,
          filled=True,
          rounded=True,
          class_names=["not pol","pol"],
          feature_names=X.columns)

y_pred = clf_dt_pruned.predict(X_train)
y_pred = clf_dt_pruned.predict(X_test)

print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred,average='weighted'))
print(classification_report(y_test, y_pred))

y_score = clf_dt_pruned.predict_proba(X_test)

fpr, tpr, th = roc_curve(y_test, y_score[:,1])

roc_auc = auc(fpr, tpr)
print(roc_auc)

plt.figure(figsize=(8,5))

plt.plot(fpr, tpr)

plt.plot([0,1], [0,1], 'k--')
plt.xlabel('False Positive Rate', fontsize=20)
plt.ylabel('True Positive Rate', fontsize=20)

plt.tick_params(axis='both', which='major', labelsize=22)
plt.show()

Perceivability¶

In [ ]:
refvar="perceivability"
taglio=0.8

X=df_class_ref.drop(refvar,axis=1).copy()

y=df_class_ref[refvar].copy()

y_up_index = y >= taglio

y[y_up_index]=1

y_zero_index = y < taglio

y[y_zero_index]=0

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

clf_dt = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1,random_state=42)

clf_dt = clf_dt.fit(X_train, y_train)

path = clf_dt.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas
ccp_alphas = ccp_alphas[:-1]

clf_dts=[]

for ccp_alpha in ccp_alphas:
  clf_dt = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
  clf_dt.fit(X_train, y_train)
  clf_dts.append(clf_dt)

train_scores = [clf_dt.score(X_train, y_train) for clf_dt in clf_dts]
test_scores = [clf_dt.score(X_test, y_test) for clf_dt in clf_dts]

fig, ax =plt.subplots()
ax.set_xlabel("alpha")
ax.set_ylabel("accuracy")
ax.set_title("Accuracy vs alpha for training and testing sets")
ax.plot(ccp_alphas,train_scores, marker ='o',label='train',drawstyle='steps-post')

ax.plot(ccp_alphas,test_scores, marker ='o',label='test',drawstyle='steps-post')
ax.legend()
plt.show()

alpha_loop_values =[]

for ccp_alpha in ccp_alphas:
  clf_dt = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1,random_state=0, ccp_alpha=ccp_alpha)
  scores= cross_val_score(clf_dt,X_train,y_train, cv=10)
  alpha_loop_values.append([ccp_alpha,np.mean(scores), np.std(scores)])

alpha_results = pd.DataFrame(alpha_loop_values,
                               columns=['alpha','mean_accuracy','std'])
  
alpha_results.plot(x='alpha',
                   y='mean_accuracy',
                   marker='o',
                   linestyle='--')
In [ ]:
alpha_results[(alpha_results['alpha']>0.0014)
&
(alpha_results['alpha']<0.002)]
In [ ]:
ideal_ccp_alpha = 0.001499

ideal_ccp_alpha = float(ideal_ccp_alpha)

clf_dt_pruned = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1,random_state=42, ccp_alpha=ideal_ccp_alpha)

clf_dt_pruned = clf_dt_pruned.fit(X_train, y_train)

plot_confusion_matrix(clf_dt_pruned,
                      X_test,
                      y_test,
                      display_labels=['not peveivable','perveivable'])

plt.figure(figsize=(15,7.5))

from sklearn.tree import plot_tree
plot_tree(clf_dt_pruned,
          filled=True,
          rounded=True,
          class_names=["not perceivable","perceivable"],
          feature_names=X.columns)

y_pred = clf_dt_pruned.predict(X_train)
y_pred = clf_dt_pruned.predict(X_test)

print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

y_score = clf_dt_pruned.predict_proba(X_test)

fpr, tpr, th = roc_curve(y_test, y_score[:,1])

roc_auc = auc(fpr, tpr)
print(roc_auc)

plt.figure(figsize=(8,5))

plt.plot(fpr, tpr)

plt.plot([0,1], [0,1], 'k--')
plt.xlabel('False Positive Rate', fontsize=20)
plt.ylabel('True Positive Rate', fontsize=20)

plt.tick_params(axis='both', which='major', labelsize=22)
plt.show()

Age of Aquisition (multi split)¶

In [ ]:
refvar="aoa"

X=df_class_ref.drop(refvar,axis=1).copy()

y=df_class_ref[refvar].copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

clf_dt = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1,random_state=42)

clf_dt = clf_dt.fit(X_train, y_train)

path = clf_dt.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas
ccp_alphas = ccp_alphas[:-1]

clf_dts=[]

for ccp_alpha in ccp_alphas:
  clf_dt = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
  clf_dt.fit(X_train, y_train)
  clf_dts.append(clf_dt)

train_scores = [clf_dt.score(X_train, y_train) for clf_dt in clf_dts]
test_scores = [clf_dt.score(X_test, y_test) for clf_dt in clf_dts]

fig, ax =plt.subplots()
ax.set_xlabel("alpha")
ax.set_ylabel("accuracy")
ax.set_title("Accuracy vs alpha for training and testing sets")
ax.plot(ccp_alphas,train_scores, marker ='o',label='train',drawstyle='steps-post')

ax.plot(ccp_alphas,test_scores, marker ='o',label='test',drawstyle='steps-post')
ax.legend()
plt.show()

alpha_loop_values =[]

for ccp_alpha in ccp_alphas:
  clf_dt = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1,random_state=0, ccp_alpha=ccp_alpha)
  scores= cross_val_score(clf_dt,X_train,y_train, cv=10)
  alpha_loop_values.append([ccp_alpha,np.mean(scores), np.std(scores)])

alpha_results = pd.DataFrame(alpha_loop_values,
                               columns=['alpha','mean_accuracy','std'])
  
alpha_results.plot(x='alpha',
                   y='mean_accuracy',
                   marker='o',
                   linestyle='--')
In [ ]:
alpha_results[(alpha_results['alpha']>0.0018)
&
(alpha_results['alpha']<0.0025)]
In [ ]:
ideal_ccp_alpha = 0.001858

ideal_ccp_alpha = float(ideal_ccp_alpha)

clf_dt_pruned = DecisionTreeClassifier(criterion='entropy', max_depth=None, min_samples_split=2, min_samples_leaf=1,random_state=42, ccp_alpha=ideal_ccp_alpha)

clf_dt_pruned = clf_dt_pruned.fit(X_train, y_train)

plot_confusion_matrix(clf_dt_pruned,
                      X_test,
                      y_test,
                      display_labels=['0-2','2-4','4-6','6-8','8-10','10-12'])

plt.figure(figsize=(15,7.5))

from sklearn.tree import plot_tree
plot_tree(clf_dt_pruned,
          filled=True,
          rounded=True,
          class_names=['0-2','2-4','4-6','6-8','8-10','10-12'],
          feature_names=X.columns)

y_pred = clf_dt_pruned.predict(X_train)
y_pred = clf_dt_pruned.predict(X_test)

print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred,average=None))
print(classification_report(y_test, y_pred))

print(clf_dt_pruned.predict_proba(X_test))

report = classification_report(y_test, y_pred, output_dict=True)

export = pd.DataFrame(report).transpose()

print(export.to_latex())

plt.show()

Age of Aquisition (binary)¶

In [ ]:
refvar="aoa"
taglio=0.6

X=df_class_ref.drop(refvar,axis=1).copy()

y=df_class_ref[refvar].copy()

y_up_index = y >= taglio

y[y_up_index]=1

y_zero_index = y < taglio

y[y_zero_index]=0




X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

clf_dt = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1,random_state=42)

clf_dt = clf_dt.fit(X_train, y_train)

path = clf_dt.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas
ccp_alphas = ccp_alphas[:-1]

clf_dts=[]

for ccp_alpha in ccp_alphas:
  clf_dt = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
  clf_dt.fit(X_train, y_train)
  clf_dts.append(clf_dt)

train_scores = [clf_dt.score(X_train, y_train) for clf_dt in clf_dts]
test_scores = [clf_dt.score(X_test, y_test) for clf_dt in clf_dts]

fig, ax =plt.subplots()
ax.set_xlabel("alpha")
ax.set_ylabel("accuracy")
ax.set_title("Accuracy vs alpha for training and testing sets")
ax.plot(ccp_alphas,train_scores, marker ='o',label='train',drawstyle='steps-post')

ax.plot(ccp_alphas,test_scores, marker ='o',label='test',drawstyle='steps-post')
ax.legend()
plt.show()

alpha_loop_values =[]

for ccp_alpha in ccp_alphas:
  clf_dt = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1,random_state=0, ccp_alpha=ccp_alpha)
  scores= cross_val_score(clf_dt,X_train,y_train, cv=10)
  alpha_loop_values.append([ccp_alpha,np.mean(scores), np.std(scores)])

alpha_results = pd.DataFrame(alpha_loop_values,
                               columns=['alpha','mean_accuracy','std'])
  
alpha_results.plot(x='alpha',
                   y='mean_accuracy',
                   marker='o',
                   linestyle='-')
In [ ]:
alpha_results[(alpha_results['alpha']>0.0025)
&
(alpha_results['alpha']<0.005)]
In [ ]:
ideal_ccp_alpha = 0.003224

ideal_ccp_alpha = float(ideal_ccp_alpha)

clf_dt_pruned = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1,random_state=42, ccp_alpha=ideal_ccp_alpha)

clf_dt_pruned = clf_dt_pruned.fit(X_train, y_train)

plot_confusion_matrix(clf_dt_pruned,
                      X_test,
                      y_test,
                      display_labels=['younger','older'])

plt.figure(figsize=(15,7.5))

from sklearn.tree import plot_tree
plot_tree(clf_dt_pruned,
          filled=True,
          rounded=True,
          class_names=["younger","older"],
          feature_names=X.columns)

y_pred = clf_dt_pruned.predict(X_train)
y_pred = clf_dt_pruned.predict(X_test)

print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

y_score = clf_dt_pruned.predict_proba(X_test)

fpr, tpr, th = roc_curve(y_test, y_score[:,1])

roc_auc = auc(fpr, tpr)
print(roc_auc)

plt.figure(figsize=(8,5))

plt.plot(fpr, tpr)

plt.plot([0,1], [0,1], 'k--')
plt.xlabel('False Positive Rate', fontsize=20)
plt.ylabel('True Positive Rate', fontsize=20)

plt.tick_params(axis='both', which='major', labelsize=22)
plt.show()

Web Corpus Frequency¶

In [ ]:
refvar="web_corpus_freq"

X=df_class_ref.drop(refvar,axis=1).copy()

y=df_class_ref[refvar].copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

clf_dt = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1,random_state=42)

clf_dt = clf_dt.fit(X_train, y_train)

path = clf_dt.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas
ccp_alphas = ccp_alphas[:-1]

clf_dts=[]

for ccp_alpha in ccp_alphas:
  clf_dt = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
  clf_dt.fit(X_train, y_train)
  clf_dts.append(clf_dt)

train_scores = [clf_dt.score(X_train, y_train) for clf_dt in clf_dts]
test_scores = [clf_dt.score(X_test, y_test) for clf_dt in clf_dts]

fig, ax =plt.subplots()
ax.set_xlabel("alpha")
ax.set_ylabel("accuracy")
ax.set_title("Accuracy vs alpha for training and testing sets")
ax.plot(ccp_alphas,train_scores, marker ='o',label='train',drawstyle='steps-post')

ax.plot(ccp_alphas,test_scores, marker ='o',label='test',drawstyle='steps-post')
ax.legend()
plt.show()

alpha_loop_values =[]

for ccp_alpha in ccp_alphas:
  clf_dt = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1,random_state=0, ccp_alpha=ccp_alpha)
  scores= cross_val_score(clf_dt,X_train,y_train, cv=10)
  alpha_loop_values.append([ccp_alpha,np.mean(scores), np.std(scores)])

alpha_results = pd.DataFrame(alpha_loop_values,
                               columns=['alpha','mean_accuracy','std'])
  
alpha_results.plot(x='alpha',
                   y='mean_accuracy',
                   yerr='std',
                   marker='o',
                   linestyle='--')
In [ ]:
alpha_results[(alpha_results['alpha']>0.0013)
&
(alpha_results['alpha']<0.0016)]
In [ ]:
ideal_ccp_alpha = 0.001376

ideal_ccp_alpha = float(ideal_ccp_alpha)

clf_dt_pruned = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1,random_state=42, ccp_alpha=ideal_ccp_alpha)

clf_dt_pruned = clf_dt_pruned.fit(X_train, y_train)

plot_confusion_matrix(clf_dt_pruned,
                      X_test,
                      y_test,
                      display_labels=['4','5','6','7','8','9'],
                      )

plt.figure(figsize=(15,7.5))

from sklearn.tree import plot_tree
plot_tree(clf_dt_pruned,
          filled=True,
          rounded=True,
          class_names=['4','5','6','7','8','9'],
          feature_names=X.columns)

y_pred = clf_dt_pruned.predict(X_train)
y_pred = clf_dt_pruned.predict(X_test)

print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred,average=None))
print(classification_report(y_test, y_pred))

print(clf_dt_pruned.predict_proba(X_test))

report = classification_report(y_test, y_pred, output_dict=True)

export = pd.DataFrame(report).transpose()

print(export.to_latex())

3.1.1 Aoa class¶

In [ ]:
df_class_aoa= dfprepro.copy()
df_class_aoa.head()
In [ ]:
dataframe = [df_class_aoa]
In [ ]:
for dataset in dataframe:
    dataset.loc[(dataset["aoa"] > 1) & (dataset["aoa"] <= 2), "aoa"] = 1
    dataset.loc[(dataset["aoa"] > 2)& (dataset["aoa"] <= 3), "aoa"] = 2
    dataset.loc[(dataset["aoa"] > 3)& (dataset["aoa"] <= 4), "aoa"] = 3
    dataset.loc[(dataset["aoa"] > 4)& (dataset["aoa"] <= 5), "aoa"] = 4
    dataset.loc[(dataset["aoa"] > 5)& (dataset["aoa"] <= 6), "aoa"] = 5
    dataset.loc[(dataset["aoa"] > 6)&( dataset["aoa"] <= 7), "aoa"] = 6
    dataset.loc[(dataset["aoa"] > 7), "aoa"] = 7
df_class_aoa.head()
In [ ]:
attributes = [col for col in df_class_aoa.columns if col != 'aoa']
X = df_class_aoa[attributes].values
y = df_class_aoa['aoa']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=100)

len(df_class_aoa), X_train.shape[0], X_test.shape[0]
In [ ]:
X=df_class_aoa.drop("aoa",axis=1).copy()
X.dtypes
In [ ]:
y_encoded=pd.get_dummies(y,columns=["aoa"]).head()
In [ ]:
y_encoded.dtypes
In [ ]:
X_train.shape, X_test.shape
In [ ]:
clf = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1)
clf.fit(X_train, y_train)
In [ ]:
for col, imp in zip(attributes, clf.feature_importances_): print(col, imp)
In [ ]:
dot_data = tree.export_graphviz (clf, out_file=None,
                                feature_names=attributes,
                                class_names=[str(v) for v in clf.classes_],
                                filled=True, rounded=True,
                                special_characters=True,
                                max_depth=2)

graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())
In [ ]:
# apply decision tree to train set
y_pred = clf.predict(X_train)
y_pred[:5]
In [ ]:
y_train.values[:5]
In [ ]:
print('Accuracy', accuracy_score(y_train, y_pred))
print('F1', f1_score(y_train, y_pred, average=None))
In [ ]:
print( classification_report(y_train, y_pred) )
In [ ]:
# Confusion matrix for trainset
# TP, FN, FP, TN
confusion_matrix(y_train, y_pred)
In [ ]:
# apply decision tree to test set
y_pred = clf.predict(X_test)
In [ ]:
print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average=None))
print(classification_report(y_test, y_pred))
confusion_matrix(y_test, y_pred)
In [ ]:
y_score = clf.predict_proba(X_test)
In [ ]:
plt.figure(figsize=(8,5))

plt.plot(fpr, tpr)

plt.plot([0,1], [0,1], 'k--')
plt.xlabel('False Positive Rate', fontsize=20)
plt.ylabel('True Positive Rate', fontsize=20)

plt.tick_params(axis='both', which='major', labelsize=22)
plt.show()

Length¶

In [ ]:
refvar="length"
taglio=0.35

X=df_class_ref.drop(refvar,axis=1).copy()

y=df_class_ref[refvar].copy()

y_up_index = y >= taglio

y[y_up_index]=1

y_zero_index = y < taglio

y[y_zero_index]=0

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

clf_dt = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1,random_state=42)

clf_dt = clf_dt.fit(X_train, y_train)

path = clf_dt.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas
ccp_alphas = ccp_alphas[:-1]

clf_dts=[]

for ccp_alpha in ccp_alphas:
  clf_dt = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
  clf_dt.fit(X_train, y_train)
  clf_dts.append(clf_dt)

train_scores = [clf_dt.score(X_train, y_train) for clf_dt in clf_dts]
test_scores = [clf_dt.score(X_test, y_test) for clf_dt in clf_dts]

fig, ax =plt.subplots()
ax.set_xlabel("alpha")
ax.set_ylabel("accuracy")
ax.set_title("Accuracy vs alpha for training and testing sets")
ax.plot(ccp_alphas,train_scores, marker ='o',label='train',drawstyle='steps-post')

ax.plot(ccp_alphas,test_scores, marker ='o',label='test',drawstyle='steps-post')
ax.legend()
plt.show()

alpha_loop_values =[]

for ccp_alpha in ccp_alphas:
  clf_dt = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1,random_state=0, ccp_alpha=ccp_alpha)
  scores= cross_val_score(clf_dt,X_train,y_train, cv=10)
  alpha_loop_values.append([ccp_alpha,np.mean(scores), np.std(scores)])

alpha_results = pd.DataFrame(alpha_loop_values,
                               columns=['alpha','mean_accuracy','std'])
  
alpha_results.plot(x='alpha',
                   y='mean_accuracy',
                   marker='o',
                   linestyle='--')
In [ ]:
alpha_results[(alpha_results['alpha']>0.001)
&
(alpha_results['alpha']<0.0025)]
In [ ]:
ideal_ccp_alpha = 0.001538

ideal_ccp_alpha = float(ideal_ccp_alpha)

clf_dt_pruned = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1,random_state=42, ccp_alpha=ideal_ccp_alpha)

clf_dt_pruned = clf_dt_pruned.fit(X_train, y_train)

plot_confusion_matrix(clf_dt_pruned,
                      X_test,
                      y_test,
                      display_labels=['short','long'])

plt.figure(figsize=(15,7.5))

from sklearn.tree import plot_tree
plot_tree(clf_dt_pruned,
          filled=True,
          rounded=True,
          class_names=["short","long"],
          feature_names=X.columns)

y_pred = clf_dt_pruned.predict(X_train)
y_pred = clf_dt_pruned.predict(X_test)

print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

y_score = clf_dt_pruned.predict_proba(X_test)

fpr, tpr, th = roc_curve(y_test, y_score[:,1])

roc_auc = auc(fpr, tpr)
print(roc_auc)

plt.figure(figsize=(8,5))

plt.plot(fpr, tpr)

plt.plot([0,1], [0,1], 'k--')
plt.xlabel('False Positive Rate', fontsize=20)
plt.ylabel('True Positive Rate', fontsize=20)

plt.tick_params(axis='both', which='major', labelsize=22)
plt.show()

3.2 Classification by KNN¶

Age of Aquisition¶

choice of k¶

In [ ]:
from sklearn.neighbors import KNeighborsClassifier


refvar="aoa"
taglio=0.6

X=df_class_ref.drop(refvar,axis=1).copy()

y=df_class_ref[refvar].copy()

y_up_index = y >= taglio

y[y_up_index]=1

y_zero_index = y < taglio

y[y_zero_index]=0

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

k = 4  
neigh = KNeighborsClassifier(n_neighbors = k).fit(X_train,y_train)
Pred_y = neigh.predict(X_test)


error_rate = []
for i in range(1,100):
 knn = KNeighborsClassifier(n_neighbors=i)
 knn.fit(X_train,y_train)
 pred_i = knn.predict(X_test)
 error_rate.append(np.mean(pred_i != y_test))

plt.figure(figsize=(10,6))
plt.plot(range(1,100),error_rate,color='blue', linestyle='dashed', 
         marker='o',markerfacecolor='red', markersize=10)
plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')
print("Minimum error:-",min(error_rate),"at K =",error_rate.index(min(error_rate)))
In [ ]:
acc = []
# Will take some time
from sklearn import metrics
for i in range(1,40):
    neigh = KNeighborsClassifier(n_neighbors = i).fit(X_train,y_train)
    yhat = neigh.predict(X_test)
    acc.append(metrics.accuracy_score(y_test, yhat))
    
plt.figure(figsize=(10,6))
plt.plot(range(1,40),acc,color = 'blue',linestyle='dashed', 
         marker='o',markerfacecolor='red', markersize=10)
plt.title('accuracy vs. K Value')
plt.xlabel('K')
plt.ylabel('Accuracy')
print("Maximum accuracy:-",max(acc),"at K =",acc.index(max(acc)))
In [ ]:
from sklearn.neighbors import KNeighborsClassifier


refvar="aoa"
taglio=0.6

X=df_class_ref.drop(refvar,axis=1).copy()

y=df_class_ref[refvar].copy()

y_up_index = y >= taglio

y[y_up_index]=1

y_zero_index = y < taglio

y[y_zero_index]=0

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

clf_knn = KNeighborsClassifier(n_neighbors=24)
clf_knn.fit(X, y)
In [ ]:
# apply KNN to train set
y_pred = clf_knn.predict(X_train)
y_pred[:5]
In [ ]:
y_train.values[:5]
In [ ]:
print('Accuracy', accuracy_score(y_train, y_pred))
print('F1', f1_score(y_train, y_pred, average='weighted'))
print( classification_report(y_train, y_pred) )
In [ ]:
# Confusion matrix for trainset
# TP, FN, FP, TN
confusion_matrix(y_train, y_pred)
In [ ]:
# apply KNN to test set
y_pred = clf_knn.predict(X_test)
In [ ]:
print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average='weighted'))
print(classification_report(y_test, y_pred))

print(confusion_matrix(y_test, y_pred))

plot_confusion_matrix(clf_knn,
                      X_test,
                      y_test,
                      display_labels=['younger','older'])
In [ ]:
y_score = clf_knn.predict_proba(X_test)
In [ ]:
fpr, tpr, th = roc_curve(y_test, y_score[:,1])

roc_auc = auc(fpr, tpr)
print(roc_auc)
In [ ]:
plt.figure(figsize=(8,5))

plt.plot(fpr, tpr)

plt.plot([0,1], [0,1], 'k--')
plt.xlabel('False Positive Rate', fontsize=20)
plt.ylabel('True Positive Rate', fontsize=20)

plt.tick_params(axis='both', which='major', labelsize=22)
plt.show()

Valence¶

choice of k¶

In [ ]:
from sklearn.neighbors import KNeighborsClassifier


refvar="valence"
taglio=0.67

X=df_class_ref.drop(refvar,axis=1).copy()

y=df_class_ref[refvar].copy()

y_up_index = y >= taglio

y[y_up_index]=1

y_zero_index = y < taglio

y[y_zero_index]=0

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

k = 4  
neigh = KNeighborsClassifier(n_neighbors = k).fit(X_train,y_train)
Pred_y = neigh.predict(X_test)


error_rate = []
for i in range(1,100):
 knn = KNeighborsClassifier(n_neighbors=i)
 knn.fit(X_train,y_train)
 pred_i = knn.predict(X_test)
 error_rate.append(np.mean(pred_i != y_test))

plt.figure(figsize=(10,6))
plt.plot(range(1,100),error_rate,color='blue', linestyle='dashed', 
         marker='o',markerfacecolor='red', markersize=10)
plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')
print("Minimum error:-",min(error_rate),"at K =",error_rate.index(min(error_rate)))
In [ ]:
acc = []
# Will take some time
from sklearn import metrics
for i in range(1,40):
    neigh = KNeighborsClassifier(n_neighbors = i).fit(X_train,y_train)
    yhat = neigh.predict(X_test)
    acc.append(metrics.accuracy_score(y_test, yhat))
    
plt.figure(figsize=(10,6))
plt.plot(range(1,40),acc,color = 'blue',linestyle='dashed', 
         marker='o',markerfacecolor='red', markersize=10)
plt.title('accuracy vs. K Value')
plt.xlabel('K')
plt.ylabel('Accuracy')
print("Maximum accuracy:-",max(acc),"at K =",acc.index(max(acc)))
In [ ]:
refvar="valence"
taglio=0.67

X=df_class_ref.drop(refvar,axis=1).copy()

y=df_class_ref[refvar].copy()

y_up_index = y >= taglio

y[y_up_index]=1

y_zero_index = y < taglio

y[y_zero_index]=0

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

clf_knn = KNeighborsClassifier(n_neighbors=12)
clf_knn.fit(X, y)

y_pred = clf_knn.predict(X_train)
print('Accuracy', accuracy_score(y_train, y_pred))
print('F1', f1_score(y_train, y_pred, average='weighted'))
print( classification_report(y_train, y_pred) )

confusion_matrix(y_train, y_pred)
In [ ]:
y_pred = clf_knn.predict(X_test)

print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average='weighted'))
print(classification_report(y_test, y_pred))

print(confusion_matrix(y_test, y_pred))

plot_confusion_matrix(clf_knn,
                      X_test,
                      y_test,
                      display_labels=['not valuable','valuable'])

y_score = clf_knn.predict_proba(X_test)

fpr, tpr, th = roc_curve(y_test, y_score[:,1])

roc_auc = auc(fpr, tpr)
print(roc_auc)


plt.figure(figsize=(8,5))

plt.plot(fpr, tpr)

plt.plot([0,1], [0,1], 'k--')
plt.xlabel('False Positive Rate', fontsize=20)
plt.ylabel('True Positive Rate', fontsize=20)

plt.tick_params(axis='both', which='major', labelsize=22)
plt.show()

Polysemy¶

choice of k¶

In [ ]:
from sklearn.neighbors import KNeighborsClassifier


refvar="polysemy"
taglio=0.6

X=df_class_ref.drop(refvar,axis=1).copy()

y=df_class_ref[refvar].copy()

y_up_index = y >= taglio

y[y_up_index]=1

y_zero_index = y < taglio

y[y_zero_index]=0

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

k = 4  
neigh = KNeighborsClassifier(n_neighbors = k).fit(X_train,y_train)
Pred_y = neigh.predict(X_test)


error_rate = []
for i in range(1,100):
 knn = KNeighborsClassifier(n_neighbors=i)
 knn.fit(X_train,y_train)
 pred_i = knn.predict(X_test)
 error_rate.append(np.mean(pred_i != y_test))

plt.figure(figsize=(10,6))
plt.plot(range(1,100),error_rate,color='blue', linestyle='dashed', 
         marker='o',markerfacecolor='red', markersize=10)
plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')
print("Minimum error:-",min(error_rate),"at K =",error_rate.index(min(error_rate)))
In [ ]:
acc = []
# Will take some time
from sklearn import metrics
for i in range(1,40):
    neigh = KNeighborsClassifier(n_neighbors = i).fit(X_train,y_train)
    yhat = neigh.predict(X_test)
    acc.append(metrics.average_precision_score(y_test, yhat))
    
plt.figure(figsize=(10,6))
plt.plot(range(1,40),acc,color = 'blue',linestyle='dashed', 
         marker='o',markerfacecolor='red', markersize=10)
plt.title('F1 score vs. K Value')
plt.xlabel('K')
plt.ylabel('F1 Score')
print("Maximum F1:-",max(acc),"at K =",acc.index(max(acc)))
In [ ]:
refvar="polysemy"
taglio=0.67

X=df_class_ref.drop(refvar,axis=1).copy()

y=df_class_ref[refvar].copy()

y_up_index = y >= taglio

y[y_up_index]=1

y_zero_index = y < taglio

y[y_zero_index]=0

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

clf_knn = KNeighborsClassifier(n_neighbors=6)
clf_knn.fit(X, y)

y_pred = clf_knn.predict(X_train)
print('Accuracy', accuracy_score(y_train, y_pred))
print('F1', f1_score(y_train, y_pred, average='weighted'))
print( classification_report(y_train, y_pred) )

confusion_matrix(y_train, y_pred)
In [ ]:
y_pred = clf_knn.predict(X_test)

print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average='weighted'))
print(classification_report(y_test, y_pred))

print(confusion_matrix(y_test, y_pred))

plot_confusion_matrix(clf_knn,
                      X_test,
                      y_test,
                      display_labels=['not polysemic','polysemic'])

y_score = clf_knn.predict_proba(X_test)

fpr, tpr, th = roc_curve(y_test, y_score[:,1])

roc_auc = auc(fpr, tpr)
print(roc_auc)


plt.figure(figsize=(8,5))

plt.plot(fpr, tpr)

plt.plot([0,1], [0,1], 'k--')
plt.xlabel('False Positive Rate', fontsize=20)
plt.ylabel('True Positive Rate', fontsize=20)

plt.tick_params(axis='both', which='major', labelsize=22)
plt.show()

Age of Acquisition (multi)¶

choice of k¶

In [ ]:
from sklearn.neighbors import KNeighborsClassifier


refvar="aoa"

X=df_class_ref.drop(refvar,axis=1).copy()

y=df_class_ref[refvar].copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

k = 4  
neigh = KNeighborsClassifier(n_neighbors = k).fit(X_train,y_train)
Pred_y = neigh.predict(X_test)


error_rate = []
for i in range(1,100):
 knn = KNeighborsClassifier(n_neighbors=i)
 knn.fit(X_train,y_train)
 pred_i = knn.predict(X_test)
 error_rate.append(np.mean(pred_i != y_test))

plt.figure(figsize=(10,6))
plt.plot(range(1,100),error_rate,color='blue', linestyle='dashed', 
         marker='o',markerfacecolor='red', markersize=10)
plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')
print("Minimum error:-",min(error_rate),"at K =",error_rate.index(min(error_rate)))
In [ ]:
clf_knn = KNeighborsClassifier(n_neighbors=37)
clf_knn.fit(X, y)

y_pred = clf_knn.predict(X_train)
print('Accuracy', accuracy_score(y_train, y_pred))
print('F1', f1_score(y_train, y_pred, average='weighted'))
print( classification_report(y_train, y_pred) )

confusion_matrix(y_train, y_pred)
In [ ]:
y_pred = clf_knn.predict(X_test)

print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average='weighted'))
print(classification_report(y_test, y_pred))

print(confusion_matrix(y_test, y_pred))

plot_confusion_matrix(clf_knn,
                      X_test,
                      y_test,
                      display_labels=['0-2','2-4','4-6','6-8','8-10'])

y_score = clf_knn.predict_proba(X_test)

report = classification_report(y_test, y_pred, output_dict=True)

export = pd.DataFrame(report).transpose()

print(export.to_latex())

Random Forest¶

In [ ]:
from sklearn.ensemble import RandomForestClassifier

Valence¶

In [ ]:
refvar='valence'
taglio=0.67

X=df_class_ref.drop(refvar,axis=1).copy()

y=df_class_ref[refvar].copy()

y_up_index = y >= taglio

y[y_up_index]=1

y_zero_index = y < taglio

y[y_zero_index]=0

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Instantiate model with 10 decision trees
model = RandomForestClassifier(n_estimators = 380, random_state = 42)
# Train the model on training data
ra=model.fit(X_train, y_train)


#TESTING THE MODEL BY PREDICTING ON TEST DATA
#AND CALCULATE THE ACCURACY SCORE

prediction_test = model.predict(X_test)
#print(y_test, prediction_test)

#Print the prediction accuracy
print ("Accuracy = ", metrics.accuracy_score(y_test, prediction_test))
#Test accuracy for various test sizes and see how it gets better with more training data

#One amazing feature of Random forest is that it provides us info on feature importances
# Get numerical feature importances
#importances = list(model.feature_importances_)

#Let us print them into a nice format.

feature_list = list(X.columns)
feature_imp = pd.Series(model.feature_importances_,index=feature_list).sort_values(ascending=False)
print(feature_imp)

y_pred = model.predict(X_train)
y_pred = model.predict(X_test)

plot_confusion_matrix(ra,
                      X_test,
                      y_test,
                      display_labels=['not val','val'],
                      )

y_score = model.predict_proba(X_test)

fpr, tpr, th = roc_curve(y_test, y_score[:,1])

roc_auc = auc(fpr, tpr)
print(roc_auc)

plt.figure(figsize=(8,5))

plt.plot(fpr, tpr)

plt.plot([0,1], [0,1], 'k--')
plt.xlabel('False Positive Rate', fontsize=20)
plt.ylabel('True Positive Rate', fontsize=20)

plt.tick_params(axis='both', which='major', labelsize=22)
plt.show()

print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred,average='weighted'))
print(classification_report(y_test, y_pred))

Polysemy¶

In [ ]:
refvar='polysemy'
taglio=0.67

X=df_class_ref.drop(refvar,axis=1).copy()

y=df_class_ref[refvar].copy()

y_up_index = y >= taglio

y[y_up_index]=1

y_zero_index = y < taglio

y[y_zero_index]=0

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

# Instantiate model with 10 decision trees
model = RandomForestClassifier(n_estimators = 385, random_state = 42)
# Train the model on training data
ra=model.fit(X_train, y_train)


#TESTING THE MODEL BY PREDICTING ON TEST DATA
#AND CALCULATE THE ACCURACY SCORE

prediction_test = model.predict(X_test)
#print(y_test, prediction_test)

#Print the prediction accuracy
print ("Accuracy = ", metrics.accuracy_score(y_test, prediction_test))
#Test accuracy for various test sizes and see how it gets better with more training data

#One amazing feature of Random forest is that it provides us info on feature importances
# Get numerical feature importances
#importances = list(model.feature_importances_)

#Let us print them into a nice format.

feature_list = list(X.columns)
feature_imp = pd.Series(model.feature_importances_,index=feature_list).sort_values(ascending=False)
print(feature_imp)


y_pred = model.predict(X_train)
y_pred = model.predict(X_test)

plot_confusion_matrix(ra,
                      X_test,
                      y_test,
                      display_labels=['not pol','pol']
                      )

y_score = model.predict_proba(X_test)

fpr, tpr, th = roc_curve(y_test, y_score[:,1])

roc_auc = auc(fpr, tpr)
print(roc_auc)

plt.figure(figsize=(8,5))

plt.plot(fpr, tpr)

plt.plot([0,1], [0,1], 'k--')
plt.xlabel('False Positive Rate', fontsize=20)
plt.ylabel('True Positive Rate', fontsize=20)

plt.tick_params(axis='both', which='major', labelsize=22)
plt.show()

print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred,average='weighted'))
print(classification_report(y_test, y_pred))

Age of Aquisition¶

In [ ]:
refvar='aoa'
taglio=0.6

X=df_class_ref.drop(refvar,axis=1).copy()

y=df_class_ref[refvar].copy()

y_up_index = y >= taglio

y[y_up_index]=1

y_zero_index = y < taglio

y[y_zero_index]=0

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Instantiate model with 10 decision trees
model = RandomForestClassifier(n_estimators = 380, random_state = 42)
# Train the model on training data
ra=model.fit(X_train, y_train)


#TESTING THE MODEL BY PREDICTING ON TEST DATA
#AND CALCULATE THE ACCURACY SCORE

prediction_test = model.predict(X_test)
#print(y_test, prediction_test)

#Print the prediction accuracy
print ("Accuracy = ", metrics.accuracy_score(y_test, prediction_test))
#Test accuracy for various test sizes and see how it gets better with more training data

#One amazing feature of Random forest is that it provides us info on feature importances
# Get numerical feature importances
#importances = list(model.feature_importances_)

#Let us print them into a nice format.

feature_list = list(X.columns)
feature_imp = pd.Series(model.feature_importances_,index=feature_list).sort_values(ascending=False)
print(feature_imp)


y_pred = model.predict(X_train)
y_pred = model.predict(X_test)

plot_confusion_matrix(ra,
                      X_test,
                      y_test,
                      display_labels=['younger','older']
                      )

y_score = model.predict_proba(X_test)

fpr, tpr, th = roc_curve(y_test, y_score[:,1])

roc_auc = auc(fpr, tpr)
print(roc_auc)

plt.figure(figsize=(8,5))

plt.plot(fpr, tpr)

plt.plot([0,1], [0,1], 'k--')
plt.xlabel('False Positive Rate', fontsize=20)
plt.ylabel('True Positive Rate', fontsize=20)

plt.tick_params(axis='both', which='major', labelsize=22)
plt.show()

print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred,average='weighted'))
print(classification_report(y_test, y_pred))

Out of bag error¶

In [ ]:
import matplotlib.pyplot as plt

from collections import OrderedDict
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

RANDOM_STATE = 42

refvar='valence'
taglio=0.67

X=df_class_ref.drop(refvar,axis=1).copy()

y=df_class_ref[refvar].copy()

y_up_index = y >= taglio

y[y_up_index]=1

y_zero_index = y < taglio

y[y_zero_index]=0


# NOTE: Setting the `warm_start` construction parameter to `True` disables
# support for parallelized ensembles but is necessary for tracking the OOB
# error trajectory during training.
ensemble_clfs = [
    (
        "RandomForestClassifier, max_features='sqrt'",
        RandomForestClassifier(
            warm_start=True,
            oob_score=True,
            max_features="sqrt",
            random_state=RANDOM_STATE,
        ),
    ),
    (
        "RandomForestClassifier, max_features='log2'",
        RandomForestClassifier(
            warm_start=True,
            max_features="log2",
            oob_score=True,
            random_state=RANDOM_STATE,
        ),
    ),
    (
        "RandomForestClassifier, max_features=None",
        RandomForestClassifier(
            warm_start=True,
            max_features=None,
            oob_score=True,
            random_state=RANDOM_STATE,
        ),
    ),
]

# Map a classifier name to a list of (<n_estimators>, <error rate>) pairs.
error_rate = OrderedDict((label, []) for label, _ in ensemble_clfs)

# Range of `n_estimators` values to explore.
min_estimators = 100
max_estimators = 1000

for label, clf in ensemble_clfs:
    for i in range(min_estimators, max_estimators + 1, 5):
        clf.set_params(n_estimators=i)
        clf.fit(X, y)

        # Record the OOB error for each `n_estimators=i` setting.
        oob_error = 1 - clf.oob_score_
        error_rate[label].append((i, oob_error))

# Generate the "OOB error rate" vs. "n_estimators" plot.
for label, clf_err in error_rate.items():
    xs, ys = zip(*clf_err)
    plt.plot(xs, ys, label=label)

plt.xlim(min_estimators, max_estimators)
plt.xlabel("n_estimators")
plt.ylabel("OOB error rate")
plt.legend(loc="upper right")
plt.show()

Entropy (Decision Tree)¶

Age of Acquisition¶

In [ ]:
refvar="aoa"
taglio=0.6

X=df_class_ref.drop(refvar,axis=1).copy()

y=df_class_ref[refvar].copy()

y_up_index = y >= taglio

y[y_up_index]=1

y_zero_index = y < taglio

y[y_zero_index]=0




X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

clf_dt = DecisionTreeClassifier(criterion='entropy', max_depth=None, min_samples_split=2, min_samples_leaf=1,random_state=42)

clf_dt = clf_dt.fit(X_train, y_train)

path = clf_dt.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas
ccp_alphas = ccp_alphas[:-1]

clf_dts=[]

for ccp_alpha in ccp_alphas:
  clf_dt = DecisionTreeClassifier(criterion='entropy',random_state=0, ccp_alpha=ccp_alpha)
  clf_dt.fit(X_train, y_train)
  clf_dts.append(clf_dt)

train_scores = [clf_dt.score(X_train, y_train) for clf_dt in clf_dts]
test_scores = [clf_dt.score(X_test, y_test) for clf_dt in clf_dts]

fig, ax =plt.subplots()
ax.set_xlabel("alpha")
ax.set_ylabel("accuracy")
ax.set_title("Accuracy vs alpha for training and testing sets")
ax.plot(ccp_alphas,train_scores, marker ='o',label='train',drawstyle='steps-post')

ax.plot(ccp_alphas,test_scores, marker ='o',label='test',drawstyle='steps-post')
ax.legend()
plt.show()

alpha_loop_values =[]

for ccp_alpha in ccp_alphas:
  clf_dt = DecisionTreeClassifier(criterion='entropy', max_depth=None, min_samples_split=2, min_samples_leaf=1,random_state=0, ccp_alpha=ccp_alpha)
  scores= cross_val_score(clf_dt,X_train,y_train, cv=10)
  alpha_loop_values.append([ccp_alpha,np.mean(scores), np.std(scores)])

alpha_results = pd.DataFrame(alpha_loop_values,
                               columns=['alpha','mean_accuracy','std'])
  
alpha_results.plot(x='alpha',
                   y='mean_accuracy',
                   yerr='std',
                   marker='o',
                   linestyle='-')
In [ ]:
alpha_results[(alpha_results['alpha']>0.002)
&
(alpha_results['alpha']<0.004)]
In [ ]:
ideal_ccp_alpha = 0.002246

ideal_ccp_alpha = float(ideal_ccp_alpha)

clf_dt_pruned = DecisionTreeClassifier(criterion='entropy', max_depth=None, min_samples_split=2, min_samples_leaf=1,random_state=42, ccp_alpha=ideal_ccp_alpha)

clf_dt_pruned = clf_dt_pruned.fit(X_train, y_train)

plot_confusion_matrix(clf_dt_pruned,
                      X_test,
                      y_test,
                      display_labels=['young','old'])

plt.figure(figsize=(15,7.5))

from sklearn.tree import plot_tree
plot_tree(clf_dt_pruned,
          filled=True,
          rounded=True,
          class_names=["young","old"],
          feature_names=X.columns)

y_pred = clf_dt_pruned.predict(X_train)
y_pred = clf_dt_pruned.predict(X_test)

print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average=None))
print(classification_report(y_test, y_pred))

y_score = clf_dt_pruned.predict_proba(X_test)

fpr, tpr, th = roc_curve(y_test, y_score[:,1])

roc_auc = auc(fpr, tpr)
print(roc_auc)

plt.figure(figsize=(8,5))

plt.plot(fpr, tpr)

plt.plot([0,1], [0,1], 'k--')
plt.xlabel('False Positive Rate', fontsize=20)
plt.ylabel('True Positive Rate', fontsize=20)

plt.tick_params(axis='both', which='major', labelsize=22)
plt.show()

Polysemy¶

In [ ]:
refvar="polysemy"
taglio=0.6

X=df_class_ref.drop(refvar,axis=1).copy()

y=df_class_ref[refvar].copy()

y_up_index = y >= taglio

y[y_up_index]=1

y_zero_index = y < taglio

y[y_zero_index]=0




X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

clf_dt = DecisionTreeClassifier(criterion='entropy', max_depth=None, min_samples_split=2, min_samples_leaf=1,random_state=42)

clf_dt = clf_dt.fit(X_train, y_train)

path = clf_dt.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas
ccp_alphas = ccp_alphas[:-1]

clf_dts=[]

for ccp_alpha in ccp_alphas:
  clf_dt = DecisionTreeClassifier(criterion='entropy',random_state=0, ccp_alpha=ccp_alpha)
  clf_dt.fit(X_train, y_train)
  clf_dts.append(clf_dt)

train_scores = [clf_dt.score(X_train, y_train) for clf_dt in clf_dts]
test_scores = [clf_dt.score(X_test, y_test) for clf_dt in clf_dts]

fig, ax =plt.subplots()
ax.set_xlabel("alpha")
ax.set_ylabel("accuracy")
ax.set_title("Accuracy vs alpha for training and testing sets")
ax.plot(ccp_alphas,train_scores, marker ='o',label='train',drawstyle='steps-post')

ax.plot(ccp_alphas,test_scores, marker ='o',label='test',drawstyle='steps-post')
ax.legend()
plt.show()

alpha_loop_values =[]

for ccp_alpha in ccp_alphas:
  clf_dt = DecisionTreeClassifier(criterion='entropy', max_depth=None, min_samples_split=2, min_samples_leaf=1,random_state=0, ccp_alpha=ccp_alpha)
  scores= cross_val_score(clf_dt,X_train,y_train, cv=10)
  alpha_loop_values.append([ccp_alpha,np.mean(scores), np.std(scores)])

alpha_results = pd.DataFrame(alpha_loop_values,
                               columns=['alpha','mean_accuracy','std'])
  
alpha_results.plot(x='alpha',
                   y='mean_accuracy',
                   yerr='std',
                   marker='o',
                   linestyle='-')
In [ ]:
alpha_results[(alpha_results['alpha']>0.002)
]
In [ ]:
ideal_ccp_alpha = 0.003002

ideal_ccp_alpha = float(ideal_ccp_alpha)

clf_dt_pruned = DecisionTreeClassifier(criterion='entropy', max_depth=None, min_samples_split=2, min_samples_leaf=1,random_state=42, ccp_alpha=ideal_ccp_alpha)

clf_dt_pruned = clf_dt_pruned.fit(X_train, y_train)

plot_confusion_matrix(clf_dt_pruned,
                      X_test,
                      y_test,
                      display_labels=['not polysemic','polysemic'])

plt.figure(figsize=(15,7.5))

from sklearn.tree import plot_tree
plot_tree(clf_dt_pruned,
          filled=True,
          rounded=True,
          class_names=["not polysemic","polysemic"],
          feature_names=X.columns)

y_pred = clf_dt_pruned.predict(X_train)
y_pred = clf_dt_pruned.predict(X_test)

print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average=None))
print(classification_report(y_test, y_pred))

y_score = clf_dt_pruned.predict_proba(X_test)

fpr, tpr, th = roc_curve(y_test, y_score[:,1])

roc_auc = auc(fpr, tpr)
print(roc_auc)

plt.figure(figsize=(8,5))

plt.plot(fpr, tpr)

plt.plot([0,1], [0,1], 'k--')
plt.xlabel('False Positive Rate', fontsize=20)
plt.ylabel('True Positive Rate', fontsize=20)

plt.tick_params(axis='both', which='major', labelsize=22)
plt.show()

Valence¶

In [ ]:
refvar="valence"
taglio=0.67

X=df_class_ref.drop(refvar,axis=1).copy()

y=df_class_ref[refvar].copy()

y_up_index = y >= taglio

y[y_up_index]=1

y_zero_index = y < taglio

y[y_zero_index]=0




X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

clf_dt = DecisionTreeClassifier(criterion='entropy', max_depth=None, min_samples_split=2, min_samples_leaf=1,random_state=42)

clf_dt = clf_dt.fit(X_train, y_train)

path = clf_dt.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas
ccp_alphas = ccp_alphas[:-1]

clf_dts=[]

for ccp_alpha in ccp_alphas:
  clf_dt = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
  clf_dt.fit(X_train, y_train)
  clf_dts.append(clf_dt)

train_scores = [clf_dt.score(X_train, y_train) for clf_dt in clf_dts]
test_scores = [clf_dt.score(X_test, y_test) for clf_dt in clf_dts]

fig, ax =plt.subplots()
ax.set_xlabel("alpha")
ax.set_ylabel("accuracy")
ax.set_title("Accuracy vs alpha for training and testing sets")
ax.plot(ccp_alphas,train_scores, marker ='o',label='train',drawstyle='steps-post')

ax.plot(ccp_alphas,test_scores, marker ='o',label='test',drawstyle='steps-post')
ax.legend()
plt.show()

alpha_loop_values =[]

for ccp_alpha in ccp_alphas:
  clf_dt = DecisionTreeClassifier(criterion='entropy', max_depth=None, min_samples_split=2, min_samples_leaf=1,random_state=0, ccp_alpha=ccp_alpha)
  scores= cross_val_score(clf_dt,X_train,y_train, cv=10)
  alpha_loop_values.append([ccp_alpha,np.mean(scores), np.std(scores)])

alpha_results = pd.DataFrame(alpha_loop_values,
                               columns=['alpha','mean_accuracy','std'])
  
alpha_results.plot(x='alpha',
                   y='mean_accuracy',
                   yerr='std',
                   marker='o',
                   linestyle='-')
In [ ]:
alpha_results[(alpha_results['alpha']>0.0025)
&
(alpha_results['alpha']<0.0035)]
In [ ]:
ideal_ccp_alpha = 0.003041

ideal_ccp_alpha = float(ideal_ccp_alpha)

clf_dt_pruned = DecisionTreeClassifier(criterion='entropy', max_depth=None, min_samples_split=2, min_samples_leaf=1,random_state=42, ccp_alpha=ideal_ccp_alpha)

clf_dt_pruned = clf_dt_pruned.fit(X_train, y_train)

plot_confusion_matrix(clf_dt_pruned,
                      X_test,
                      y_test,
                      display_labels=['not valuable','valuable'])

plt.figure(figsize=(15,7.5))

from sklearn.tree import plot_tree
plot_tree(clf_dt_pruned,
          filled=True,
          rounded=True,
          class_names=["not valuable","valuable"],
          feature_names=X.columns)

y_pred = clf_dt_pruned.predict(X_train)
y_pred = clf_dt_pruned.predict(X_test)

print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average=None))
print(classification_report(y_test, y_pred))

y_score = clf_dt_pruned.predict_proba(X_test)

fpr, tpr, th = roc_curve(y_test, y_score[:,1])

roc_auc = auc(fpr, tpr)
print(roc_auc)

plt.figure(figsize=(8,5))

plt.plot(fpr, tpr)

plt.plot([0,1], [0,1], 'k--')
plt.xlabel('False Positive Rate', fontsize=20)
plt.ylabel('True Positive Rate', fontsize=20)

plt.tick_params(axis='both', which='major', labelsize=22)
plt.show()

Citraro¶

In [ ]:
###### Lecture Pipeline ######
# you can/have to modify the order for your project,
# e.g. hyperparameter tuning before evaluating a decision tree

# Hold-out
from sklearn.model_selection import train_test_split

# Decision Tree
from sklearn.tree import DecisionTreeClassifier
# Optional: KNN
from sklearn.neighbors import KNeighborsClassifier

##### Decision Tree Visualization ###
from sklearn import tree # export_graphviz() method
import pydotplus # viz stuff
from IPython.display import Image # viz stuff
####################################

######## Evaluation #######
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.metrics import roc_curve, auc, roc_auc_score
# Optional: baseline comparison
from sklearn.dummy import DummyClassifier
###########################

# hyperparameter tuning
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# cross-validation
from sklearn.model_selection import cross_val_score
In [ ]:
refvar="valence"
taglio=0.67

X=df_class_ref.drop(refvar,axis=1).copy()

y=df_class_ref[refvar].copy()

y_up_index = y >= taglio

y[y_up_index]=1

y_zero_index = y < taglio

y[y_zero_index]=0




X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
In [ ]:
clf = DecisionTreeClassifier(criterion='gini',
                             max_depth=None ,
                             min_samples_split=10,
                             min_samples_leaf=5,
                            )

clf.fit(X_train, y_train)
In [ ]:
imp_dict = {attr: clf.feature_importances_[i] for i, attr in enumerate(attributes)}
imp_dict

{k: v for k, v in sorted(imp_dict.items(), key=lambda item: item[1], reverse=True)}
In [ ]:
dot_data = tree.export_graphviz (clf, out_file=None,
                                feature_names=attributes,
                                class_names=[str(v) for v in clf.classes_],
                                filled=True, rounded=True,
                                special_characters=True,
                                max_depth=2)

graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())
In [ ]:
y_pred = clf.predict(X_train)
In [ ]:
print('Accuracy', accuracy_score(y_train, y_pred))
print('F1', f1_score(y_train, y_pred, average=None))
In [ ]:
print( classification_report(y_train, y_pred) )
In [ ]:
# NOT SUGGESTED FOR THE PROJECT

cf = confusion_matrix(y_train, y_pred)
cf
In [ ]:
y_pred = clf.predict( X_test )
In [ ]:
y_score = clf.predict_proba(X_test)
y_score[:6]
In [ ]:
fpr, tpr, th = roc_curve(y_test, y_score[:,1])

roc_auc = auc(fpr, tpr)
print(roc_auc)
plt.figure(figsize=(8,5))

plt.plot(fpr, tpr)

plt.plot([0,1], [0,1], 'k--')
plt.xlabel('False Positive Rate', fontsize=20)
plt.ylabel('True Positive Rate', fontsize=20)

plt.tick_params(axis='both', which='major', labelsize=22)
plt.show()
In [ ]:
dummy_clf = DummyClassifier(strategy='constant', constant=1)
dummy_clf.fit(X_test, y_test)

y_pred = dummy_clf.predict(X_test)
In [ ]:
param_list = {'max_depth': [None] + [2, 3, 4],
             'min_samples_split': [2, 5, 10, 20],
             'min_samples_leaf': [1, 5, 10, 20]
             }

grid_search = GridSearchCV(clf, param_grid=param_list, scoring='f1')
grid_search.fit(X, y)

res = grid_search.cv_results_
In [ ]:
grid_search.best_estimator_
In [ ]:
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")
In [ ]:
report(res, n_top=3)
In [ ]:
clf_dt_pruned = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=5, min_samples_leaf=20)

clf_dt_pruned = clf_dt_pruned.fit(X_train, y_train)

plot_confusion_matrix(clf_dt_pruned,
                      X_test,
                      y_test,
                      display_labels=['not valuable','valuable'])

plt.figure(figsize=(15,7.5))

from sklearn.tree import plot_tree
plot_tree(clf_dt_pruned,
          filled=True,
          rounded=True,
          class_names=["not valuable","valuable"],
          feature_names=X.columns)

y_pred = clf_dt_pruned.predict(X_train)
y_pred = clf_dt_pruned.predict(X_test)

print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average=None))
print(classification_report(y_test, y_pred))

y_score = clf_dt_pruned.predict_proba(X_test)

fpr, tpr, th = roc_curve(y_test, y_score[:,1])

roc_auc = auc(fpr, tpr)
print(roc_auc)

plt.figure(figsize=(8,5))

plt.plot(fpr, tpr)

plt.plot([0,1], [0,1], 'k--')
plt.xlabel('False Positive Rate', fontsize=20)
plt.ylabel('True Positive Rate', fontsize=20)

plt.tick_params(axis='both', which='major', labelsize=22)
plt.show()

4 Patter mining¶

4.1 Preprocessing¶

In [ ]:
dfprepro.head()
Out[ ]:
length arousal valence dominance familiarity aoa semsize masculinity polysemy web_corpus_freq perceivability
0 8 4.200 2.864 4.333 2.382 6.760 4.652 5.391 0 5.0 4.9230
1 5 3.125 5.781 4.667 3.324 5.177 5.121 3.303 0 6.0 5.6250
2 10 3.273 5.250 5.235 5.121 5.543 2.667 3.971 0 5.0 3.2315
3 8 4.194 3.767 4.419 3.971 6.233 4.679 5.167 0 5.0 2.9415
4 10 3.846 3.880 4.800 3.097 6.407 5.083 4.571 0 5.0 2.9315
In [ ]:
dfprepro.describe()
Out[ ]:
length arousal valence dominance familiarity aoa semsize masculinity polysemy web_corpus_freq perceivability
count 4682.000000 4682.000000 4682.000000 4682.000000 4682.000000 4682.000000 4682.000000 4682.000000 4682.000000 4682.000000 4682.000000
mean 6.348355 4.678129 5.086797 5.044939 5.271335 4.143427 4.136403 4.099933 0.080948 6.285135 4.644645
std 2.006230 1.097163 1.594344 0.930669 0.921218 1.252770 1.023293 0.912293 0.272785 0.843987 1.366345
min 2.000000 2.057000 1.030000 1.941000 1.647000 1.219000 1.375000 1.000000 0.000000 4.000000 1.803500
25% 5.000000 3.849000 4.115000 4.529000 4.706000 3.114000 3.438000 3.606000 0.000000 6.000000 3.407000
50% 6.000000 4.571000 5.290000 5.123000 5.438000 4.177000 4.186500 4.121000 0.000000 6.000000 4.562500
75% 8.000000 5.419000 6.088000 5.600000 5.969000 5.152000 4.882000 4.656000 0.000000 7.000000 5.970500
max 16.000000 8.177000 8.647000 8.371000 6.939000 6.971000 6.912000 6.971000 1.000000 9.000000 6.925000
In [ ]:
df_pm= dfprepro.copy()
In [ ]:
var_to_scale=['length','aoa',"arousal","valence","dominance","familiarity","semsize","masculinity","perceivability"]

features = df_pm[var_to_scale]
scaler = MinMaxScaler().fit(features.values)
features = scaler.transform(features.values)


df_pm[var_to_scale] = 4*features-0.5
df_pm.head()
Out[ ]:
length arousal valence dominance familiarity aoa semsize masculinity polysemy web_corpus_freq perceivability
0 1.214286 0.900654 0.463109 0.988025 0.055556 3.353268 1.867347 2.441551 0 5.0 1.936396
1 0.357143 0.198039 1.994946 1.195801 0.767574 2.252434 2.206159 1.042790 0 6.0 2.484672
2 1.785714 0.294771 1.716096 1.549145 2.125850 2.506954 0.433357 1.490286 0 5.0 0.615298
3 1.214286 0.896732 0.937311 1.041524 1.256614 2.986787 1.886852 2.291492 0 5.0 0.388802
4 1.785714 0.669281 0.996652 1.278538 0.595994 3.107789 2.178707 1.892229 0 5.0 0.380992
In [ ]:
df_pm=df_pm.round()
In [ ]:
df_pm.head()
Out[ ]:
length arousal valence dominance familiarity aoa semsize masculinity polysemy web_corpus_freq perceivability
0 1.0 1.0 0.0 1.0 0.0 3.0 2.0 2.0 0 5.0 2.0
1 0.0 0.0 2.0 1.0 1.0 2.0 2.0 1.0 0 6.0 2.0
2 2.0 0.0 2.0 2.0 2.0 3.0 0.0 1.0 0 5.0 1.0
3 1.0 1.0 1.0 1.0 1.0 3.0 2.0 2.0 0 5.0 0.0
4 2.0 1.0 1.0 1.0 1.0 3.0 2.0 2.0 0 5.0 0.0
In [ ]:
df_pm.isnull().sum()
Out[ ]:
length              0
arousal             0
valence             0
dominance           0
familiarity         0
aoa                 0
semsize             0
masculinity         0
polysemy            0
web_corpus_freq    14
perceivability      0
dtype: int64
In [ ]:
df_pm_copy=df_pm.copy()
In [ ]:
df_pm.loc[(df_pm['web_corpus_freq'].isnull() == True), 'web_corpus_freq'] = df_pm['web_corpus_freq'].mean()
In [ ]:
df_pm['length'] = df_pm['length'].astype(str) + '_Lenght'
df_pm['arousal'] = df_pm['arousal'].astype(str) + '_Arousal'
df_pm['valence'] = df_pm['valence'].astype(str) + '_Valence'
df_pm['dominance'] = df_pm['dominance'].astype(str) + '_Dominance'
df_pm['familiarity'] = df_pm['familiarity'].astype(str) + '_Familiarity'
df_pm['aoa'] = df_pm['aoa'].astype(str) + '_Age_of_Aquisition'
df_pm['semsize'] = df_pm['semsize'].astype(str) + '_SemSize'
df_pm['masculinity'] = df_pm['masculinity'].astype(str) + '_Masculinity'
df_pm['web_corpus_freq'] = df_pm['web_corpus_freq'].astype(str) + '_Web_Corpus_Freq'
df_pm['perceivability'] = df_pm['perceivability'].astype(str) + '_Perceivability'


df_pm.head()
Out[ ]:
length arousal valence dominance familiarity aoa semsize masculinity polysemy web_corpus_freq perceivability
0 1.0_Lenght 1.0_Arousal 0.0_Valence 1.0_Dominance 0.0_Familiarity 3.0_Age_of_Aquisition 2.0_SemSize 2.0_Masculinity 0 5.0_Web_Corpus_Freq 2.0_Perceivability
1 0.0_Lenght 0.0_Arousal 2.0_Valence 1.0_Dominance 1.0_Familiarity 2.0_Age_of_Aquisition 2.0_SemSize 1.0_Masculinity 0 6.0_Web_Corpus_Freq 2.0_Perceivability
2 2.0_Lenght 0.0_Arousal 2.0_Valence 2.0_Dominance 2.0_Familiarity 3.0_Age_of_Aquisition 0.0_SemSize 1.0_Masculinity 0 5.0_Web_Corpus_Freq 1.0_Perceivability
3 1.0_Lenght 1.0_Arousal 1.0_Valence 1.0_Dominance 1.0_Familiarity 3.0_Age_of_Aquisition 2.0_SemSize 2.0_Masculinity 0 5.0_Web_Corpus_Freq 0.0_Perceivability
4 2.0_Lenght 1.0_Arousal 1.0_Valence 1.0_Dominance 1.0_Familiarity 3.0_Age_of_Aquisition 2.0_SemSize 2.0_Masculinity 0 5.0_Web_Corpus_Freq 0.0_Perceivability
In [ ]:
polysemy_dict = {0: 'Not Polysemy', 1: 'Polysemy'}
df_pm['polysemy'] = df_pm['polysemy'].map(polysemy_dict)

df_pm.head()
Out[ ]:
length arousal valence dominance familiarity aoa semsize masculinity polysemy web_corpus_freq perceivability
0 1.0_Lenght 1.0_Arousal 0.0_Valence 1.0_Dominance 0.0_Familiarity 3.0_Age_of_Aquisition 2.0_SemSize 2.0_Masculinity Not Polysemy 5.0_Web_Corpus_Freq 2.0_Perceivability
1 0.0_Lenght 0.0_Arousal 2.0_Valence 1.0_Dominance 1.0_Familiarity 2.0_Age_of_Aquisition 2.0_SemSize 1.0_Masculinity Not Polysemy 6.0_Web_Corpus_Freq 2.0_Perceivability
2 2.0_Lenght 0.0_Arousal 2.0_Valence 2.0_Dominance 2.0_Familiarity 3.0_Age_of_Aquisition 0.0_SemSize 1.0_Masculinity Not Polysemy 5.0_Web_Corpus_Freq 1.0_Perceivability
3 1.0_Lenght 1.0_Arousal 1.0_Valence 1.0_Dominance 1.0_Familiarity 3.0_Age_of_Aquisition 2.0_SemSize 2.0_Masculinity Not Polysemy 5.0_Web_Corpus_Freq 0.0_Perceivability
4 2.0_Lenght 1.0_Arousal 1.0_Valence 1.0_Dominance 1.0_Familiarity 3.0_Age_of_Aquisition 2.0_SemSize 2.0_Masculinity Not Polysemy 5.0_Web_Corpus_Freq 0.0_Perceivability

4.2 Patter mining¶

Frequent itemsets¶

In [ ]:
X = df_pm.values.tolist()
In [ ]:
X[0]
Out[ ]:
['8_Lenght',
 '3.0_Arousal',
 '2.0_Valence',
 '4.0_Dominance',
 '2.0_Familiarity',
 '6.0_Age_of_Aquisition',
 '4.0_SemSize',
 '5.0_Masculinity',
 'Not Polysemy',
 '5.0_Web_Corpus_Freq',
 '4.0_Perceivability']
In [ ]:
help(apriori)
Help on built-in function apriori in module fim:

apriori(...)
    apriori (tracts, target='s', supp=10, zmin=1, zmax=None, report='a',
             eval='x', agg='x', thresh=10, prune=None, algo='b', mode='',
             border=None)
    Find frequent item sets with the Apriori algorithm.
    tracts  transaction database to mine (mandatory)
            The database must be an iterable of transactions;
            each transaction must be an iterable of items;
            each item must be a hashable object.
            If the database is a dictionary, the transactions are
            the keys, the values their (integer) multiplicities.
    target  type of frequent item sets to find     (default: s)
            s/a   sets/all   all     frequent item sets
            c     closed     closed  frequent item sets
            m     maximal    maximal frequent item sets
            g     gens       generators
            r     rules      association rules
    supp    minimum support of an item set         (default: 10)
            (positive: percentage, negative: absolute number)
    conf    minimum confidence of an assoc. rule   (default: 80%)
    zmin    minimum number of items per item set   (default: 1)
    zmax    maximum number of items per item set   (default: no limit)
    report  values to report with an item set      (default: a)
            a     absolute item set support (number of transactions)
            s     relative item set support as a fraction
            S     relative item set support as a percentage
            e     value of item set evaluation measure
            E     value of item set evaluation measure as a percentage
            (     combine values in a tuple (must be first character)
            [     combine values in a list  (must be first character)
            #     pattern spectrum as a dictionary  (no patterns)
            =     pattern spectrum as a list        (no patterns)
            |     pattern spectrum as three columns (no patterns)
            for target 'r' (association rules) also available:
            b     absolute body set  support (number of transactions)
            x     relative body set  support as a fraction
            X     relative body set  support as a percentage
            h     absolute head item support (number of transactions)
            y     relative head item support as a fraction
            Y     relative head item support as a percentage
            c     rule confidence as a fraction
            C     rule confidence as a percentage
            l     lift value of a rule (confidence/prior)
            L     lift value of a rule as a percentage
            Q     support of the empty set (total number of transactions)
    eval    measure for item set evaluation        (default: x)
            x     none       no measure / zero (default)
            b     ldratio    binary logarithm of support quotient       (+)
            c     conf       rule confidence                            (+)
            d     confdiff   absolute confidence difference to prior    (+)
            l     lift       lift value (confidence divided by prior)   (+)
            a     liftdiff   absolute difference of lift value to 1     (+)
            q     liftquot   difference of lift quotient to 1           (+)
            v     cvct       conviction (inverse lift for negated head) (+)
            e     cvctdiff   absolute difference of conviction to 1     (+)
            r     cvctquot   difference of conviction quotient to 1     (+)
            k     cprob      conditional probability ratio              (+)
            j     import     importance (binary log. of prob. ratio)    (+)
            z     cert       certainty factor (relative conf. change)   (+)
            n     chi2       normalized chi^2 measure                   (+)
            p     chi2pval   p-value from (unnormalized) chi^2 measure  (-)
            y     yates      normalized chi^2 with Yates' correction    (+)
            t     yatespval  p-value from Yates-corrected chi^2 measure (-)
            i     info       information difference to prior            (+)
            g     infopval   p-value from G statistic/info. difference  (-)
            f     fetprob    Fisher's exact test (table probability)    (-)
            h     fetchi2    Fisher's exact test (chi^2 measure)        (-)
            m     fetinfo    Fisher's exact test (mutual information)   (-)
            s     fetsupp    Fisher's exact test (support)              (-)
            Measures marked with (+) must meet or exceed the threshold,
            measures marked with (-) must not exceed the threshold
            in order for the item set to be reported.
    agg     evaluation measure aggregation mode    (default: x)
            x     none       no aggregation (use first value)
            m     min        minimum of individual measure values
            n     max        maximum of individual measure values
            a     avg        average of individual measure values
    thresh  threshold for evaluation measure       (default: 10%)
    prune   min. size for evaluation filtering     (default: no pruning)
            = 0   backward filtering       (no subset check)
            < 0   weak   forward filtering (one subset  must qualify)
            > 0   strong forward filtering (all subsets must qualify)
    algo    algorithm variant to use               (default: a)
            b     basic      standard algorithm (only choice)
    mode    operation mode indicators/flags        (default: None)
            x     do not use perfect extension pruning
            t/T   do not organize transactions as a prefix tree
            y     a-posteriori pruning of infrequent item sets
            z     invalidate evaluation below expected support
            o     use original rule support definition (body & head)
    border  support border for filtering item sets (default: None)
            Must be a list or tuple of (absolute) minimum support values
            per item set size (by which the list/tuple is indexed).
    appear  dictionary mapping items to item appearance indicators,
            with the key None referring to the default item appearance.
            (If None does not occur as a key or no dictionary is given,
            the default item appearance indicator is 'both'.)
            This parameter is only used if the target type is rules.
            * item may not appear anywhere in a rule:
              '-', 'n', 'none', 'neither', 'ignore'
            * item may appear only in rule body/antecedent:
              'i', 'in', 'inp', 'input', 'b', 'body',
              'a', 'ante', 'antecedent'
            * item may appear only in rule head/consequent:
              'o', 'out',      'output', 'h', 'head',
              'c', 'cons', 'consequent'
            * item may appear anywhere in a rule:
              'io', 'i&o', 'inout', 'in&out', 'bh', 'b&h', 'both'
    returns if report is not in ['#','=','|']:
              if the target is association rules:
                a list of rules (i.e. tuples with two or more elements),
                each consisting of a head/consequent item, a tuple with
                a body/antecedent item set, and the values selected by
                the parameter 'report', which may be combined into a
                tuple or a list if report[0] is '(' or '[', respectively.
              if the target is a type of item sets:
                a list of patterns (i.e. tuples with one or more elements),
                each consisting of a tuple with a found frequent item set
                and the values selected by the parameter 'report', which
                may be combined into a tuple or list if report[0] is '('
                or '[', respectively
            if report in ['#','=','|']:
              a pattern spectrum as a dictionary mapping pattern sizes
              to the corresponding occurrence support ranges, as a list
              of triplets (size, min. support, max. support) or as three
              columns for sizes and minimum and maximum support values

In [ ]:
itemsets = apriori(X, target='s', supp=2, zmin=2, report='S')
In [ ]:
itemsets[:5]
Out[ ]:
[(('8.0_Valence', 'Not Polysemy'), 2.1785561725758225),
 (('6.0_SemSize', 'Not Polysemy'), 2.2212729602733874),
 (('11_Lenght', 'Not Polysemy'), 2.3067065356685177),
 (('1.0_Masculinity', 'Not Polysemy'), 2.285348141819735),
 (('2.0_Dominance', 'Not Polysemy'), 2.5630072618539086)]
In [ ]:
# frequent itemset
itemsets[0][0]
Out[ ]:
('8.0_Valence', 'Not Polysemy')
In [ ]:
# support
itemsets[0][1]
Out[ ]:
2.1785561725758225
In [ ]:
itemsets = apriori(X, target='m', supp=2, zmin=2, report='S')
In [ ]:
itemsets[:5]
Out[ ]:
[(('8.0_Valence', 'Not Polysemy'), 2.1785561725758225),
 (('6.0_SemSize', 'Not Polysemy'), 2.2212729602733874),
 (('11_Lenght', 'Not Polysemy'), 2.3067065356685177),
 (('1.0_Masculinity', 'Not Polysemy'), 2.285348141819735),
 (('2.0_Dominance', 'Not Polysemy'), 2.5630072618539086)]
In [ ]:

In [ ]:
itemsets[0][0]
Out[ ]:
('8.0_Valence', 'Not Polysemy')
In [ ]:
df_pm.describe()
Out[ ]:
length arousal valence dominance familiarity aoa semsize masculinity polysemy web_corpus_freq perceivability
count 4682 4682 4682 4682 4682 4682 4682 4682 4682 4682 4682
unique 4 4 4 4 4 4 4 4 2 7 4
top 1.0_Lenght 1.0_Arousal 2.0_Valence 1.0_Dominance 2.0_Familiarity 2.0_Age_of_Aquisition 2.0_SemSize 2.0_Masculinity Not Polysemy 6.0_Web_Corpus_Freq 1.0_Perceivability
freq 3407 2905 2684 2300 2964 2113 2299 2625 4303 2122 1940
In [ ]:
len_max_it = []
len_cl_it = []
len_all_it = []
for i in range(1, 8+1):
    max_itemsets = apriori(X, target='m', supp=i, zmin=3)
    cl_itemsets = apriori(X, target='c', supp=i, zmin=3)
    all_itemsets = apriori(X, target='s', supp=i, zmin=3)
    len_max_it.append( len(max_itemsets)  )
    len_cl_it.append( len(cl_itemsets) )
    len_all_it.append( len(all_itemsets) )
    
plt.plot(len_max_it, label='maximal')
plt.plot(len_all_it, label='all')
plt.plot(len_cl_it, label='closed')
plt.legend(fontsize=15)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.xlabel('%support', fontsize=15)

plt.show()
In [ ]:
len_max_it = []
len_cl_it = []
len_all_it = []
for i in range(1, 6+1):
    max_itemsets = apriori(X, target='m', supp=2, zmin=i)
    cl_itemsets = apriori(X, target='c', supp=2, zmin=i)
    all_itemsets = apriori(X, target='s', supp=2, zmin=i)
    len_max_it.append( len(max_itemsets)  )
    len_cl_it.append( len(cl_itemsets) )
    len_all_it.append( len(all_itemsets) )
    
plt.plot(len_max_it, label='maximal')
plt.plot(len_cl_it, label='closed')
plt.plot(len_all_it, label='all')
plt.legend(fontsize=15)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.xlabel('#zmin', fontsize=15)

plt.show()
In [ ]:
df_no_pol=df_pm.drop('polysemy',axis=1)
In [ ]:
df_pm.head()
Out[ ]:
length arousal valence dominance familiarity aoa semsize masculinity web_corpus_freq perceivability
0 8_Lenght 4.0_Arousal 2.0_Valence 4.0_Dominance 1.0_Familiarity 10.0_Age_of_Aquisition 6.0_SemSize 7.0_Masculinity 5.0_Web_Corpus_Freq 6.0_Perceivability
1 5_Lenght 2.0_Arousal 6.0_Valence 4.0_Dominance 3.0_Familiarity 7.0_Age_of_Aquisition 7.0_SemSize 4.0_Masculinity 6.0_Web_Corpus_Freq 7.0_Perceivability
2 10_Lenght 2.0_Arousal 6.0_Valence 5.0_Dominance 7.0_Familiarity 8.0_Age_of_Aquisition 2.0_SemSize 5.0_Masculinity 5.0_Web_Corpus_Freq 3.0_Perceivability
3 8_Lenght 3.0_Arousal 4.0_Valence 4.0_Dominance 4.0_Familiarity 9.0_Age_of_Aquisition 6.0_SemSize 7.0_Masculinity 5.0_Web_Corpus_Freq 2.0_Perceivability
4 10_Lenght 3.0_Arousal 4.0_Valence 4.0_Dominance 3.0_Familiarity 9.0_Age_of_Aquisition 7.0_SemSize 6.0_Masculinity 5.0_Web_Corpus_Freq 2.0_Perceivability
In [ ]:
X_no_pol = df_no_pol.values.tolist()
In [ ]:
len_max_it = []
len_cl_it = []
len_all_it = []
for i in range(1, 5+1):
    max_itemsets = apriori(X, target='m', supp=i, zmin=3)
    cl_itemsets = apriori(X, target='c', supp=i, zmin=3)
    all_itemsets = apriori(X, target='s', supp=i, zmin=3)
    len_max_it.append( len(max_itemsets)  )
    len_cl_it.append( len(cl_itemsets) )
    len_all_it.append( len(all_itemsets) )
    
plt.plot(len_all_it, label='all')
plt.plot(len_max_it, label='maximal')
plt.plot(len_cl_it, label='closed')
plt.legend(fontsize=15)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.xlabel('%support', fontsize=15)

plt.show()
In [ ]:
len_max_it = []
len_cl_it = []
len_all_it = []
for i in range(1, 6+1):
    max_itemsets = apriori(X, target='m', supp=2, zmin=i)
    cl_itemsets = apriori(X, target='c', supp=2, zmin=i)
    all_itemsets = apriori(X, target='s', supp=2, zmin=i)
    len_max_it.append( len(max_itemsets)  )
    len_cl_it.append( len(cl_itemsets) )
    len_all_it.append( len(all_itemsets) )

plt.plot(len_all_it, label='all')
plt.plot(len_max_it, label='maximal')
plt.plot(len_cl_it, label='closed')
plt.legend(fontsize=15)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.xlabel('%support', fontsize=15)

plt.show()
In [ ]:
filter_0 = []
filter_1 = []


for i in range(2, 8+1):
    max_itemsets = apriori(X, target='a', supp=i, zmin=3)
    
    len_filter_0=0

    len_filter_0+=len([item for item in max_itemsets if '0.0_Age_of_Aquisition' in item[0]])
    len_filter_0+=len([item for item in max_itemsets if '1.0_Age_of_Aquisition' in item[0]])
    len_filter_0+=len([item for item in max_itemsets if '2.0_Age_of_Aquisition' in item[0]])
    len_filter_0+=len([item for item in max_itemsets if '3.0_Age_of_Aquisition' in item[0]])
    len_filter_0+=len([item for item in max_itemsets if '4.0_Age_of_Aquisition' in item[0]])
    len_filter_0+=len([item for item in max_itemsets if '5.0_Age_of_Aquisition' in item[0]])

    filter_0.append( len_filter_0 )

    len_filter_1=0
    len_filter_1+=len([item for item in max_itemsets if '6.0_Age_of_Aquisition' in item[0]])
    len_filter_1+=len([item for item in max_itemsets if '7.0_Age_of_Aquisition' in item[0]])
    len_filter_1+=len([item for item in max_itemsets if '8.0_Age_of_Aquisition' in item[0]])
    len_filter_1+=len([item for item in max_itemsets if '9.0_Age_of_Aquisition' in item[0]])
    len_filter_1+=len([item for item in max_itemsets if '10.0_Age_of_Aquisition' in item[0]])

    filter_1.append( len_filter_1 )

    
plt.plot(filter_0, label='younger')
plt.plot(filter_1, label='older')



plt.legend(fontsize=15)
plt.xticks(fontsize=20)
plt.xlabel('%support', fontsize=15)

plt.show()
In [ ]:
filter_0 = []
filter_1 = []


for i in range(2, 8+1):
    max_itemsets = apriori(X, target='a', supp=i, zmin=3)
    
    len_filter_0=0

    len_filter_0+=len([item for item in max_itemsets if '0.0_Valence' in item[0]])
    len_filter_0+=len([item for item in max_itemsets if '1.0_Valence' in item[0]])
    len_filter_0+=len([item for item in max_itemsets if '2.0_Valence' in item[0]])
    len_filter_0+=len([item for item in max_itemsets if '3.0_Valence' in item[0]])
    len_filter_0+=len([item for item in max_itemsets if '4.0_Valence' in item[0]])
    len_filter_0+=len([item for item in max_itemsets if '5.0_Valence' in item[0]])

    filter_0.append( len_filter_0 )

    len_filter_1=0
    len_filter_1+=len([item for item in max_itemsets if '6.0_Valence' in item[0]])
    len_filter_1+=len([item for item in max_itemsets if '7.0_Valence' in item[0]])
    len_filter_1+=len([item for item in max_itemsets if '8.0_Valence' in item[0]])
    len_filter_1+=len([item for item in max_itemsets if '9.0_Valence' in item[0]])
    len_filter_1+=len([item for item in max_itemsets if '10.0_Valence' in item[0]])

    filter_1.append( len_filter_1 )

    
plt.plot(filter_0, label='no val')
plt.plot(filter_1, label='val')



plt.legend(fontsize=15)
plt.xticks(fontsize=20)
plt.xlabel('%support', fontsize=15)

plt.show()

Association Rules¶

In [ ]:
rules = apriori(X_no_pol, target='r', supp=7, zmin=1, conf=75, report='aScl')
len(rules)
Out[ ]:
51
In [ ]:
rules_supp = []
rules_zmin = []
rules_conf = []

supp_range=[6,8,10,12,14]

zmin_range=[1,2,3,4,5]

conf_range=[50,60,70,80,90]

for i,j,k in zip(supp_range,zmin_range,conf_range):
    suppp = apriori(X, target='r', supp=i,conf=60)
    zminn = apriori(X, target='r', zmin=j,conf=60,supp=8)
    confi = apriori(X, target='r', conf=k,supp=8)
    rules_supp.append( len(suppp)  )
    rules_zmin.append( len(zminn) )
    rules_conf.append( len(confi) )
    
plt.plot(rules_supp, label='supp')
plt.plot(rules_zmin, label='zmin')
plt.plot(rules_conf, label='conf')
plt.legend(fontsize=15)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.xlabel('iteration', fontsize=15)

plt.show()
In [ ]:
count=0
for i in range(len(rules)):
  if rules[i][5]>1.6:
    count+=1
    print(rules[i][1], rules[i][0], rules[i][5])
    print("\n")
print(count)
('0.0_Age_of_Aquisition', '3.0_Perceivability') 3.0_Familiarity 2.0291812958222906


('0.0_Age_of_Aquisition', '0.0_Lenght') 3.0_Familiarity 2.092629715676513


('0.0_Age_of_Aquisition', '2.0_Dominance') 3.0_Familiarity 2.15094549757535


('0.0_Age_of_Aquisition', '2.0_Valence') 3.0_Familiarity 2.0607110927721615


('0.0_Age_of_Aquisition',) 3.0_Familiarity 2.0683135425501837


('3.0_Valence', '2.0_Arousal') 2.0_Dominance 1.6851493862268596


('3.0_Valence', '2.0_SemSize') 2.0_Dominance 1.6761757612300272


('3.0_Perceivability', '7.0_Web_Corpus_Freq', '2.0_Valence') 3.0_Familiarity 1.7993229600606484


('3.0_Perceivability', '1.0_Age_of_Aquisition', '2.0_Dominance') 2.0_Valence 1.6206524147097918


('3.0_Perceivability', '0.0_Lenght', '2.0_Dominance') 2.0_Valence 1.6045936704181385


('3.0_Perceivability', '1.0_Masculinity') 2.0_Valence 1.6422164675005584


('3.0_Perceivability', '1.0_SemSize', '2.0_Dominance') 2.0_Valence 1.6413434838948537


('3.0_Perceivability', '1.0_SemSize', '1.0_Arousal') 2.0_Valence 1.6120887400184276


('3.0_Perceivability', '3.0_Familiarity', '2.0_Dominance') 2.0_Valence 1.621554705133399


('3.0_Perceivability', '3.0_Familiarity', '1.0_Arousal') 2.0_Valence 1.6759611065358988


('3.0_Perceivability', '2.0_Dominance', '1.0_Arousal') 2.0_Valence 1.774768741956242


('3.0_Perceivability', '2.0_Dominance', '2.0_Masculinity') 2.0_Valence 1.6559708342796577


('7.0_Web_Corpus_Freq', '2.0_Dominance', '1.0_Arousal') 2.0_Valence 1.739711516684282


('1.0_Age_of_Aquisition', '2.0_Dominance', '1.0_Arousal') 2.0_Valence 1.6366788663663663


('1.0_Masculinity', '2.0_Dominance', '1.0_Arousal') 2.0_Valence 1.6379451670168828


('1.0_SemSize', '3.0_Familiarity', '2.0_Dominance') 2.0_Valence 1.6112367991384702


('1.0_SemSize', '2.0_Dominance', '1.0_Arousal') 2.0_Valence 1.6667974426038943


('1.0_SemSize', '2.0_Dominance', '1.0_Lenght') 2.0_Valence 1.6243252813720235


('3.0_Familiarity', '2.0_Dominance', '1.0_Arousal') 2.0_Valence 1.6774946657183498


('2.0_Dominance', '1.0_Arousal', '2.0_Masculinity') 2.0_Valence 1.6309943211694184


('2.0_Dominance', '1.0_Arousal', '1.0_Lenght') 2.0_Valence 1.615561697383268


('2.0_Dominance', '1.0_Arousal') 2.0_Valence 1.600871642034784


27
In [ ]:
dom1=[r for r in rules if r[0] == '1.0_Dominance']
In [ ]:
val2=[r for r in rules if r[0] == '2.0_Valence']
In [ ]:
print(dom1)
[('1.0_Dominance', ('1.0_Valence', '1.0_Perceivability', '1.0_Lenght'), 401, 8.56471593336181, 0.8285123966942148, 1.6865630614444844), ('1.0_Dominance', ('1.0_Valence', '1.0_Perceivability'), 528, 11.277231952157198, 0.822429906542056, 1.6741812271434375), ('1.0_Dominance', ('1.0_Valence', '1.0_SemSize', '1.0_Arousal'), 435, 9.29090132422042, 0.8285714285714286, 1.6866832298136647), ('1.0_Dominance', ('1.0_Valence', '1.0_SemSize', '1.0_Lenght'), 411, 8.778299871849637, 0.8187250996015937, 1.666639528841157), ('1.0_Dominance', ('1.0_Valence', '1.0_SemSize'), 555, 11.853908586074327, 0.8149779735682819, 1.65901168358552), ('1.0_Dominance', ('1.0_Valence', '2.0_Masculinity', '1.0_Arousal', '1.0_Lenght'), 405, 8.650149508756941, 0.805168986083499, 1.6390439968882358), ('1.0_Dominance', ('1.0_Valence', '2.0_Masculinity', '1.0_Arousal'), 552, 11.78983340452798, 0.8165680473372781, 1.6622485207100592), ('1.0_Dominance', ('1.0_Valence', '1.0_Arousal', '2.0_Familiarity'), 459, 9.8035027765912, 0.8038528896672504, 1.6363648823574202), ('1.0_Dominance', ('1.0_Valence', '1.0_Arousal', '1.0_Lenght'), 549, 11.725758222981632, 0.8133333333333334, 1.655663768115942), ('1.0_Dominance', ('1.0_Valence', '1.0_Arousal'), 751, 16.040153780435713, 0.8189749182115594, 1.667148072637618)]
In [ ]:
for el in range(len(dom1)):
  print( 'to_predict:', [r for r in rules if r[0] == '1.0_Dominance'][el][0])
  print( 'how?', [r for r in rules if r[0] == '1.0_Dominance'][el][1])
to_predict: 1.0_Dominance
how? ('1.0_Valence', '1.0_Perceivability', '1.0_Lenght')
to_predict: 1.0_Dominance
how? ('1.0_Valence', '1.0_Perceivability')
to_predict: 1.0_Dominance
how? ('1.0_Valence', '1.0_SemSize', '1.0_Arousal')
to_predict: 1.0_Dominance
how? ('1.0_Valence', '1.0_SemSize', '1.0_Lenght')
to_predict: 1.0_Dominance
how? ('1.0_Valence', '1.0_SemSize')
to_predict: 1.0_Dominance
how? ('1.0_Valence', '2.0_Masculinity', '1.0_Arousal', '1.0_Lenght')
to_predict: 1.0_Dominance
how? ('1.0_Valence', '2.0_Masculinity', '1.0_Arousal')
to_predict: 1.0_Dominance
how? ('1.0_Valence', '1.0_Arousal', '2.0_Familiarity')
to_predict: 1.0_Dominance
how? ('1.0_Valence', '1.0_Arousal', '1.0_Lenght')
to_predict: 1.0_Dominance
how? ('1.0_Valence', '1.0_Arousal')
In [ ]:
for el in range(len(val2)):
  print( 'to_predict:', [r for r in rules if r[0] == '2.0_Valence'][el][0])
  print( 'how?', [r for r in rules if r[0] == '2.0_Valence'][el][1])
to_predict: 2.0_Valence
how? ('3.0_Perceivability', '1.0_Age_of_Aquisition', '1.0_Arousal')
to_predict: 2.0_Valence
how? ('3.0_Perceivability', '1.0_Age_of_Aquisition', '1.0_Lenght')
to_predict: 2.0_Valence
how? ('3.0_Perceivability', '1.0_Age_of_Aquisition')
to_predict: 2.0_Valence
how? ('3.0_Perceivability', '1.0_SemSize')
to_predict: 2.0_Valence
how? ('3.0_Perceivability', '2.0_Dominance')
to_predict: 2.0_Valence
how? ('3.0_Perceivability', '1.0_Arousal', '1.0_Lenght')
to_predict: 2.0_Valence
how? ('3.0_Perceivability', '1.0_Arousal')
to_predict: 2.0_Valence
how? ('3.0_Perceivability', '1.0_Lenght')
to_predict: 2.0_Valence
how? ('3.0_Perceivability',)
to_predict: 2.0_Valence
how? ('1.0_Age_of_Aquisition', '1.0_SemSize', '2.0_Dominance')
to_predict: 2.0_Valence
how? ('1.0_Age_of_Aquisition', '2.0_Dominance', '1.0_Arousal')
to_predict: 2.0_Valence
how? ('1.0_SemSize', '2.0_Dominance', '1.0_Arousal')
to_predict: 2.0_Valence
how? ('1.0_SemSize', '2.0_Dominance', '2.0_Familiarity')
to_predict: 2.0_Valence
how? ('1.0_SemSize', '2.0_Dominance', '1.0_Lenght')
to_predict: 2.0_Valence
how? ('1.0_SemSize', '2.0_Dominance')
to_predict: 2.0_Valence
how? ('6.0_Web_Corpus_Freq', '2.0_Dominance', '1.0_Arousal')
to_predict: 2.0_Valence
how? ('2.0_Dominance', '2.0_Masculinity', '1.0_Arousal', '1.0_Lenght')
to_predict: 2.0_Valence
how? ('2.0_Dominance', '2.0_Masculinity', '1.0_Arousal')
to_predict: 2.0_Valence
how? ('2.0_Dominance', '1.0_Arousal', '2.0_Familiarity', '1.0_Lenght')
to_predict: 2.0_Valence
how? ('2.0_Dominance', '1.0_Arousal', '2.0_Familiarity')
to_predict: 2.0_Valence
how? ('2.0_Dominance', '1.0_Arousal', '1.0_Lenght')
to_predict: 2.0_Valence
how? ('2.0_Dominance', '1.0_Arousal')
In [ ]:
# <--
rules[0][0], rules[0][1]
Out[ ]:
('Not Polysemy', ('8_Lenght',))
In [ ]:
# asbolute support
rules[0][2]
Out[ ]:
484
In [ ]:
# support as percentage
rules[0][3]
Out[ ]:
10.337462622810763
In [ ]:
# confidence
rules[0][4]
Out[ ]:
0.983739837398374
In [ ]:
# lift
rules[0][5]
Out[ ]:
1.0703857584706453
In [ ]: